General information
The Two Sigma Financial News competition is a unique one: not only is it a kernels-only competition where we can't download the data, but in the second stage our solutions will be used to make predictions on real future data.
I'll try to do an extensive EDA for this competition and try to find some interesting things about the data.
P.S. I'm learning to use plotly, so there will be interactive charts at the end!
Getting the data and importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
# official way to get the data
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()
print('Done!')
(market_train_df, news_train_df) = env.get_training_data()
We have two datasets, so let's explore them one by one.
Market data
We have a really interesting dataset containing stock prices for many companies over more than ten years!
For now let's look at the data itself and not think about the competition. We can see long-term trends, companies that rose and fell, and many other things.
print(f'{market_train_df.shape[0]} samples and {market_train_df.shape[1]} features in the training market dataset.')
market_train_df.head()
First, let's take 10 random assets and plot them.
data = []
for asset in np.random.choice(market_train_df['assetName'].unique(), 10):
    asset_df = market_train_df[(market_train_df['assetName'] == asset)]
    data.append(go.Scatter(
        x = asset_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = asset_df['close'].values,
        name = asset
    ))

layout = go.Layout(dict(title = "Closing prices of 10 random assets",
                        xaxis = dict(title = 'Month'),
                        yaxis = dict(title = 'Price (USD)'),
                        ),
                   legend=dict(orientation="h"))
py.iplot(dict(data=data, layout=layout), filename='basic-line')
I plot the data for the whole period because I want to show the long-term trends. The assets were sampled randomly, but you should see that some companies' stocks started trading later and some disappeared. A disappearance could be due to bankruptcy, an acquisition, or other reasons.
Well, these were some random companies. It would be more interesting to see the general trend of prices.
data = []
#market_train_df['close'] = market_train_df['close'] / 20
for i in [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]:
    price_df = market_train_df.groupby('time')['close'].quantile(i).reset_index()
    data.append(go.Scatter(
        x = price_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = price_df['close'].values,
        name = f'{i} quantile'
    ))

layout = go.Layout(dict(title = "Trends of closing prices by quantiles",
                        xaxis = dict(title = 'Month'),
                        yaxis = dict(title = 'Price (USD)'),
                        ),
                   legend=dict(orientation="h"))
py.iplot(dict(data=data, layout=layout), filename='basic-line')
It's cool to be able to see how the market fell and rose again. I have marked four events when the market saw serious falls in stock prices. You can also notice that prices in the higher quantiles increase over time while prices in the lower quantiles decrease. Maybe the gap between the poor and the rich is increasing... On the other hand, maybe more "little" companies are entering the market, and their stock prices aren't very high.
Now let's have a look at these price drops in detail.
market_train_df['price_diff'] = market_train_df['close'] - market_train_df['open']
grouped = market_train_df.groupby('time').agg({'price_diff': ['std', 'min']}).reset_index()
print(f"Average standard deviation of price change within a day in {grouped['price_diff']['std'].mean():.4f}.")
g = grouped.sort_values(('price_diff', 'std'), ascending=False)[:10]
g['min_text'] = 'Maximum price drop: ' + (-1 * g['price_diff']['min']).astype(str)
trace = go.Scatter(
    x = g['time'].dt.strftime(date_format='%Y-%m-%d').values,
    y = g['price_diff']['std'].values,
    mode = 'markers',
    marker = dict(
        size = g['price_diff']['std'].values,
        color = g['price_diff']['std'].values,
        colorscale = 'Portland',
        showscale = True
    ),
    text = g['min_text'].values
)
data = [trace]
layout = go.Layout(
    autosize = True,
    title = 'Top 10 months by standard deviation of price change within a day',
    hovermode = 'closest',
    yaxis = dict(title = 'price_diff', ticklen = 5, gridwidth = 2),
    showlegend = False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
We can see huge price fluctuations when the market crashed. Think about it... but that's wrong! There was no big crash in January 2010... Let's dive deeper into the data!
Possible data errors
First, let's sort the data by the difference between the open and close prices.
market_train_df.sort_values('price_diff')[:10]
So the price of "Towers Watson & Co" stock was almost 10k... I think this is just an error in the data.
But what about Bank of New York Mellon?
Let's look at the data from Yahoo:
No spikes there.
Another case is a price equal to 999; such numbers are usually suspicious. Let's have a look at Archrock Inc - no spikes there either.
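Before moving on, a quick sanity check of the "999 is suspicious" hunch. This is a minimal sketch of my own; it only counts exact matches of that round value in the open and close columns, which is an assumption rather than a rule from the data description.
# Count rows whose open or close price is exactly the round value 999 flagged above.
suspicious = (market_train_df['open'] == 999.0) | (market_train_df['close'] == 999.0)
print(f'{suspicious.sum()} rows have an open or close price of exactly 999.')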
So, let's try to find strange cases.
market_train_df['close_to_open'] = np.abs(market_train_df['close'] / market_train_df['open'])
print(f"In {(market_train_df['close_to_open'] >= 1.2).sum()} lines price increased by 20% or more.")
print(f"In {(market_train_df['close_to_open'] <= 0.8).sum()} lines price decreased by 20% or more.")
Well, considering that we have more than 4 million rows, that's not many, and a lot of these cases are due to price falls during market crashes. Okay, we just need to deal with the outliers.
print(f"In {(market_train_df['close_to_open'] >= 2).sum()} lines price increased by 100% or more.")
print(f"In {(market_train_df['close_to_open'] <= 0.5).sum()} lines price decreased by 100% or more.")
As a quick fix, I'll replace the outliers in these rows with the mean open or close price of the respective company.
market_train_df['assetName_mean_open'] = market_train_df.groupby('assetName')['open'].transform('mean')
market_train_df['assetName_mean_close'] = market_train_df.groupby('assetName')['close'].transform('mean')

# if open price is too far from mean open price for this company, replace it. Otherwise replace close price.
for i, row in market_train_df.loc[market_train_df['close_to_open'] >= 2].iterrows():
    if np.abs(row['assetName_mean_open'] - row['open']) > np.abs(row['assetName_mean_close'] - row['close']):
        market_train_df.loc[i, 'open'] = row['assetName_mean_open']
    else:
        market_train_df.loc[i, 'close'] = row['assetName_mean_close']

for i, row in market_train_df.loc[market_train_df['close_to_open'] <= 0.5].iterrows():
    if np.abs(row['assetName_mean_open'] - row['open']) > np.abs(row['assetName_mean_close'] - row['close']):
        market_train_df.loc[i, 'open'] = row['assetName_mean_open']
    else:
        market_train_df.loc[i, 'close'] = row['assetName_mean_close']
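A quick verification of my own that the replacement worked: recomputing the ratio should make the extreme counts drop sharply (they may not reach zero, since a company's mean price can still be far from a single day's other price).
# Recompute the ratio after the fix and count the remaining extreme rows.
market_train_df['close_to_open'] = np.abs(market_train_df['close'] / market_train_df['open'])
print(f"{(market_train_df['close_to_open'] >= 2).sum()} rows still show a 2x+ intraday increase.")
print(f"{(market_train_df['close_to_open'] <= 0.5).sum()} rows still show a 50%+ intraday decrease.")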
Let's create the plot again:
market_train_df['price_diff'] = market_train_df['close'] - market_train_df['open']
grouped = market_train_df.groupby(['time']).agg({'price_diff': ['std', 'min']}).reset_index()
g = grouped.sort_values(('price_diff', 'std'), ascending=False)[:10]
g['min_text'] = 'Maximum price drop: ' + (-1 * np.round(g['price_diff']['min'], 2)).astype(str)
trace = go.Scatter(
    x = g['time'].dt.strftime(date_format='%Y-%m-%d').values,
    y = g['price_diff']['std'].values,
    mode = 'markers',
    marker = dict(
        size = g['price_diff']['std'].values * 5,
        color = g['price_diff']['std'].values,
        colorscale = 'Portland',
        showscale = True
    ),
    text = g['min_text'].values
)
data = [trace]
layout = go.Layout(
    autosize = True,
    title = 'Top 10 months by standard deviation of price change within a day',
    hovermode = 'closest',
    yaxis = dict(title = 'price_diff', ticklen = 5, gridwidth = 2),
    showlegend = False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
Now the plot makes much more sense.
Let's have a look at the target variable.
data = []
for i in [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]:
    price_df = market_train_df.groupby('time')['returnsOpenNextMktres10'].quantile(i).reset_index()
    data.append(go.Scatter(
        x = price_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = price_df['returnsOpenNextMktres10'].values,
        name = f'{i} quantile'
    ))

layout = go.Layout(dict(title = "Trends of returnsOpenNextMktres10 by quantiles",
                        xaxis = dict(title = 'Month'),
                        yaxis = dict(title = 'Returns'),
                        ),
                   legend=dict(orientation="h"))
py.iplot(dict(data=data, layout=layout), filename='basic-line')
We can see that the quantiles deviate widely, but the mean changes little.
Now I think it's time to drop the old data. Let's keep only the data from 2010 onwards, so that we get rid of the data from the biggest crisis.
Let's have a look at the target variable now.
data = []
market_train_df = market_train_df.loc[market_train_df['time'] >= '2010-01-01 22:00:00+0000']

price_df = market_train_df.groupby('time')['returnsOpenNextMktres10'].mean().reset_index()
data.append(go.Scatter(
    x = price_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
    y = price_df['returnsOpenNextMktres10'].values,
    name = 'mean returnsOpenNextMktres10'
))

layout = go.Layout(dict(title = "Trend of returnsOpenNextMktres10 mean",
                        xaxis = dict(title = 'Month'),
                        yaxis = dict(title = 'Returns'),
                        ),
                   legend=dict(orientation="h"))
py.iplot(dict(data=data, layout=layout), filename='basic-line')
The fluctuations look big, but in fact they stay below 8%. It really looks like random noise...
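To put a number on that noise impression, here is a rough diagnostic of my own (not a formal test): the overall mean of the daily means is tiny compared with their day-to-day spread, which is consistent with noise around zero.
# Compare the overall mean of the daily target means with their standard deviation.
daily_mean = market_train_df.groupby('time')['returnsOpenNextMktres10'].mean()
print(f'mean of daily means: {daily_mean.mean():.5f}, std of daily means: {daily_mean.std():.5f}')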
Now let's recall the description:
The marketdata contains a variety of returns calculated over different timespans. All of the returns in this set of marketdata have these properties:
- Returns are always calculated either open-to-open (from the opening time of one trading day to the open of another) or close-to-close (from the closing time of one trading day to the close of another).
- Returns are either raw, meaning that the data is not adjusted against any benchmark, or market-residualized (Mktres), meaning that the movement of the market as a whole has been accounted for, leaving only movements inherent to the instrument.
- Returns can be calculated over any arbitrary interval. Provided here are 1 day and 10 day horizons.
- Returns are tagged with 'Prev' if they are backwards looking in time, or 'Next' if forwards looking.
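To make these definitions concrete, here is a small sketch of how I read them. It assumes that returnsOpenPrevRaw1 for an asset is simply open_t / open_{t-1} - 1 over consecutive trading days, and that the asset code 'AAPL.O' exists in the data; both are my assumptions, not documented facts.
# Sanity-check the definition on one asset: the raw 1-day open-to-open return
# should match open_t / open_{t-1} - 1 (assumed formula; 'AAPL.O' assumed to exist).
sample = market_train_df[market_train_df['assetCode'] == 'AAPL.O'].sort_values('time')
manual = sample['open'] / sample['open'].shift(1) - 1
print(pd.concat([sample['returnsOpenPrevRaw1'], manual.rename('manual')], axis=1).head())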
Let's have a look at the means of these variables:
data = []
for col in ['returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
            'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
            'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
            'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
            'returnsOpenNextMktres10']:
    df = market_train_df.groupby('time')[col].mean().reset_index()
    data.append(go.Scatter(
        x = df['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = df[col].values,
        name = col
    ))

layout = go.Layout(dict(title = "Trend of mean values",
                        xaxis = dict(title = 'Month'),
                        yaxis = dict(title = 'Returns'),
                        ),
                   legend=dict(orientation="h"))
py.iplot(dict(data=data, layout=layout), filename='basic-line')
Well, it's difficult for me to interpret this, but it seems that the 10-day Prev returns fluctuate the most.
News data
news_train_df.head()
print(f'{news_train_df.shape[0]} samples and {news_train_df.shape[1]} features in the training news dataset.')
The file is too big to process all the text, so let's look at a word cloud of the most recent 1,000,000 headlines.
text = ' '.join(news_train_df['headline'].str.lower().values[-1000000:])
wordcloud = WordCloud(max_font_size=None, stopwords=stop, background_color='white',width=1200, height=1000).generate(text)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud)
plt.title('Top words in headline')
plt.axis("off")
plt.show()
# Let's also limit the time period
news_train_df = news_train_df.loc[news_train_df['time'] >= '2010-01-01 22:00:00+0000']
(news_train_df['urgency'].value_counts() / 1000000).plot(kind='bar');
plt.xticks(rotation=30);
plt.title('Urgency counts (mln)');
Well, it seems that urgency "2" is almost never used in practice.
news_train_df['sentence_word_count'] = news_train_df['wordCount'] / news_train_df['sentenceCount']
plt.boxplot(news_train_df['sentence_word_count'][news_train_df['sentence_word_count'] < 40]);
There are some big outliers, but sentences mostly contain 15-25 words.
news_train_df['provider'].value_counts().head(10)
Reuters is the most common provider, which isn't surprising.
(news_train_df['headlineTag'].value_counts() / 1000)[:10].plot(kind='barh');
plt.title('headlineTag counts (thousands)');
Well, most of the news is untagged.
for i, j in zip([-1, 0, 1], ['negative', 'neutral', 'positive']):
    df_sentiment = news_train_df.loc[news_train_df['sentimentClass'] == i, 'assetName']
    print(f'Top mentioned companies for {j} sentiment are:')
    print(df_sentiment.value_counts().head(5))
    print('')
I find it interesting that Apple is the company with the most mentions for both negative and positive sentiment.
At first I was sad that we don't have access to the texts of the news, but then I realized we wouldn't be able to use them anyway due to the kernel memory limits.
Modeling
It's time to build a model! I think in this case we should build a binary classifier - we will simply predict whether the target goes up or down.
#%%time
# code mostly taken from this kernel: https://www.kaggle.com/ashishpatel26/bird-eye-view-of-two-sigma-xgb

def data_prep(market_df, news_df):
    # market features
    market_df['time'] = market_df.time.dt.date
    market_df['returnsOpenPrevRaw1_to_volume'] = market_df['returnsOpenPrevRaw1'] / market_df['volume']
    market_df['close_to_open'] = market_df['close'] / market_df['open']
    market_df['volume_to_mean'] = market_df['volume'] / market_df['volume'].mean()

    # news features
    news_df['sentence_word_count'] = news_df['wordCount'] / news_df['sentenceCount']
    news_df['time'] = news_df.time.dt.hour
    news_df['sourceTimestamp'] = news_df.sourceTimestamp.dt.hour
    news_df['firstCreated'] = news_df.firstCreated.dt.date
    news_df['assetCodesLen'] = news_df['assetCodes'].map(lambda x: len(eval(x)))
    news_df['assetCodes'] = news_df['assetCodes'].map(lambda x: list(eval(x))[0])
    news_df['headlineLen'] = news_df['headline'].apply(lambda x: len(x))
    # note: this overwrites the assetCodesLen above with the string length of the first asset code
    news_df['assetCodesLen'] = news_df['assetCodes'].apply(lambda x: len(x))
    news_df['asset_sentiment_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
    news_df['asset_sentence_mean'] = news_df.groupby(['assetName', 'sentenceCount'])['time'].transform('mean')
    # label-encode headline tags
    lbl = {k: v for v, k in enumerate(news_df['headlineTag'].unique())}
    news_df['headlineTagT'] = news_df['headlineTag'].map(lbl)

    # aggregate news by day and asset, then merge onto the market data
    kcol = ['firstCreated', 'assetCodes']
    news_df = news_df.groupby(kcol, as_index=False).mean()
    market_df = pd.merge(market_df, news_df, how='left', left_on=['time', 'assetCode'],
                         right_on=['firstCreated', 'assetCodes'])
    # label-encode asset codes
    lbl = {k: v for v, k in enumerate(market_df['assetCode'].unique())}
    market_df['assetCodeT'] = market_df['assetCode'].map(lbl)
    market_df = market_df.dropna(axis=0)
    return market_df

market_train_df.drop(['price_diff', 'assetName_mean_open', 'assetName_mean_close'], axis=1, inplace=True)
market_train = data_prep(market_train_df, news_train_df)
print(market_train.shape)
up = market_train.returnsOpenNextMktres10 >= 0

fcol = [c for c in market_train.columns if c not in ['assetCode', 'assetCodes', 'assetCodesLen', 'assetName', 'assetCodeT',
                                                     'firstCreated', 'headline', 'headlineTag', 'marketCommentary', 'provider',
                                                     'returnsOpenNextMktres10', 'sourceId', 'subjects', 'time', 'time_x', 'universe',
                                                     'sourceTimestamp']]

X = market_train[fcol].values
up = up.values
r = market_train.returnsOpenNextMktres10.values

# Scaling of X values
mins = np.min(X, axis=0)
maxs = np.max(X, axis=0)
rng = maxs - mins
X = 1 - ((maxs - X) / rng)
X_train, X_test, up_train, up_test, r_train, r_test = model_selection.train_test_split(X, up, r, test_size=0.1, random_state=99)

# xgb_up = XGBClassifier(n_jobs=4,
#                        n_estimators=300,
#                        max_depth=3,
#                        eta=0.15,
#                        random_state=42)
params = {'learning_rate': 0.01, 'max_depth': 12, 'boosting': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'is_training_metric': True, 'seed': 42}
model = lgb.train(params, train_set=lgb.Dataset(X_train, label=up_train), num_boost_round=2000,
                  valid_sets=[lgb.Dataset(X_train, label=up_train), lgb.Dataset(X_test, label=up_test)],
                  verbose_eval=100, early_stopping_rounds=100)
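accuracy_score was imported at the top but never used, so here is a quick check of my own: directional accuracy on the held-out split, with an assumed 0.5 threshold for turning the predicted probability into an up/down label.
# Directional accuracy on the validation split (0.5 is an assumed threshold).
val_pred = model.predict(X_test) > 0.5
print(f'Validation accuracy: {accuracy_score(up_test, val_pred):.4f}')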
def generate_color():
    color = '#{:02x}{:02x}{:02x}'.format(*map(lambda x: np.random.randint(0, 255), range(3)))
    return color

df = pd.DataFrame({'imp': model.feature_importance(), 'col': fcol})
df = df.sort_values(['imp', 'col'], ascending=[True, False])
data = [df]
for dd in data:
    colors = []
    for i in range(len(dd)):
        colors.append(generate_color())

    data = [go.Bar(
        orientation = 'h',
        x = dd.imp,
        y = dd.col,
        name = 'Features',
        textfont = dict(size=20),
        marker = dict(
            color = colors,
            line = dict(color='#000000', width=0.5),
            opacity = 0.87
        )
    )]
    layout = go.Layout(
        title = 'Feature Importance of LGB',
        xaxis = dict(title='Columns', ticklen=5, zeroline=False, gridwidth=2),
        yaxis = dict(title='Value Count', ticklen=5, gridwidth=2),
        showlegend = True
    )
    py.iplot(dict(data=data, layout=layout), filename='horizontal-bar')
days = env.get_prediction_days()
import time

n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days += 1
    if n_days % 50 == 0:
        print(n_days, end=' ')

    # prepare features for the current day
    t = time.time()
    market_obs_df = data_prep(market_obs_df, news_obs_df)
    market_obs_df = market_obs_df[market_obs_df.assetCode.isin(predictions_template_df.assetCode)]
    X_live = market_obs_df[fcol].values
    X_live = 1 - ((maxs - X_live) / rng)
    prep_time += time.time() - t

    # predict
    t = time.time()
    lp = model.predict(X_live)
    prediction_time += time.time() - t

    # package the predictions: map probability [0, 1] to confidence [-1, 1]
    t = time.time()
    confidence = 2 * lp - 1
    preds = pd.DataFrame({'assetCode': market_obs_df['assetCode'], 'confidence': confidence})
    predictions_template_df = predictions_template_df.merge(preds, how='left').drop('confidenceValue', axis=1).fillna(0).rename(columns={'confidence': 'confidenceValue'})
    env.predict(predictions_template_df)
    packaging_time += time.time() - t

env.write_submission_file()
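The loop above accumulates timing counters but never reports them; a small summary of my own using those variables:
# Report where the time went across the prediction days.
total = prep_time + prediction_time + packaging_time
print(f'prep: {prep_time:.1f}s, prediction: {prediction_time:.1f}s, packaging: {packaging_time:.1f}s (total {total:.1f}s over {n_days} days)')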