Forecast bilateral trade series using importer and exporter node network statistics and XGBoost. The goal is to find which geometric information is most useful for forecasting. Is any geometric information of use? Does it motivate using something like a GNN?
In previous work I've calculated some basic network statistics on IMF Direction of Trade Statistics (DOTS) export data.
I looked for relevant features using univariate linear regression in this notebook.
In this notebook I'll use XGBoost.
import warnings
from math import ceil

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from tqdm import tqdm

warnings.filterwarnings('ignore')
# monthly bilateral export values, pivoted to one column per (importer, exporter) pair
timeSeries=(pd.read_csv('dotsTimeSeries.csv')
            .pivot_table(index='period', columns=['ReferenceArea', 'CounterpartReferenceArea'], values='value')
            )
# period-over-period percent change of the logged series
tsPctChange=np.log(timeSeries).pct_change().iloc[1:].dropna(axis=1)
tsPctChange.columns=['-'.join(col) for col in tsPctChange.columns]
# drop series with extreme swings (|change| > 150%)
tsPctChange[tsPctChange>1.5]=np.nan
tsPctChange[tsPctChange<-1.5]=np.nan
tsPctChange=tsPctChange.dropna(axis=1)
tsPctChange.index=pd.to_datetime(tsPctChange.index)
tsPctChange=tsPctChange[tsPctChange.index > '1985-01-01']
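An aside on the transform above: taking `pct_change` of logged levels stacks two growth transforms. A common alternative is the plain log first difference, which approximates the period-over-period percent change directly; a sketch of that alternative (`tsLogDiff` is a new name, not used later in the notebook):

# alternative growth-rate transform (not used below): log first differences,
# which approximate percent changes for small moves
tsLogDiff = np.log(timeSeries).diff().iloc[1:].dropna(axis=1)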
# node-level network statistics computed in earlier work
netStats=pd.read_csv('DOTSnetStats.csv').drop(['Unnamed: 0', 'CONNECTIVITY', 'HAS_BRIDGE', 'TOTAL_NET_VALUE', 'PAGERANK_NUMPY'],axis=1)
netStats.set_index(['index', 'PERIOD'], inplace=True)
# reshape to wide: PERIOD index, (economy, statistic) columns
netStatsWide=(netStats
              .reset_index()
              .melt(id_vars=['index', 'PERIOD'])
              .pivot_table(index='PERIOD', columns=['index', 'variable'], values='value')
              )
netStatsWide.index = pd.to_datetime(netStatsWide.index)
netStatsWidePctChange=netStatsWide.pct_change().iloc[1:].dropna(axis=1)
netStatsWidePctChange=netStatsWidePctChange[netStatsWidePctChange.index > '1985-01-01']
# offset the target by one period so net-stat changes at t forecast the trade
# change at t+1 (the features never see information from the future)
tsPctChange=tsPctChange.shift(-1).iloc[:-1]
# trim the last period of the net stats so sizes match
netStatsWidePctChange=netStatsWidePctChange.iloc[:-1]
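A quick toy check that the offset points the right way: after `shift(-1)` on the target, the value stored at each timestamp is the following period's change, so a feature row at t is paired with the outcome at t+1 (values here are made up).

# toy sanity check of the target offset (hypothetical values)
toy = pd.Series([1.0, 2.0, 3.0],
                index=pd.to_datetime(['2000-01-01', '2000-02-01', '2000-03-01']))
print(toy.shift(-1))
# 2000-01-01    2.0  <- January's row now holds February's value
# 2000-02-01    3.0
# 2000-03-01    NaN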
netStats.corr()
| DEGREE | IN_DEGREE | OUT_DEGREE | DEGREE_CENTRALITY | IN_DEGREE_CENTRALITY | OUT_DEGREE_CENTRALITY | AVG_NEIGHBOR_DEGREE | PAGERANK | KATZ | CLOSENESS_CENTRALITY | BETWEENNESS_CENTRALITY | CLUSTCOEF | NUM_NODES | NUM_EDGES | AVERAGECLUSTCOEF | TRIANGLES
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DEGREE | 1.000000 | 0.905947 | 0.962545 | 0.992468 | 0.907749 | 0.943530 | 0.539215 | 0.342109 | 0.030614 | 0.779620 | 0.579524 | -0.582810 | 0.560734 | 0.578295 | -0.366186 | 0.958205 |
IN_DEGREE | 0.905947 | 1.000000 | 0.757224 | 0.879958 | 0.993021 | 0.722659 | 0.424597 | 0.352711 | 0.044777 | 0.888389 | 0.435932 | -0.439187 | 0.675427 | 0.696580 | -0.441086 | 0.926553 |
OUT_DEGREE | 0.962545 | 0.757224 | 1.000000 | 0.967567 | 0.764474 | 0.992800 | 0.559940 | 0.301901 | 0.018554 | 0.633813 | 0.614865 | -0.617850 | 0.432515 | 0.446061 | -0.282453 | 0.884874 |
DEGREE_CENTRALITY | 0.992468 | 0.879958 | 0.967567 | 1.000000 | 0.894707 | 0.962756 | 0.512342 | 0.380614 | 0.044864 | 0.748006 | 0.642281 | -0.633303 | 0.491743 | 0.508283 | -0.343765 | 0.922991 |
IN_DEGREE_CENTRALITY | 0.907749 | 0.993021 | 0.764474 | 0.894707 | 1.000000 | 0.740622 | 0.395934 | 0.401356 | 0.059710 | 0.873463 | 0.493825 | -0.480728 | 0.611031 | 0.631584 | -0.427157 | 0.905375 |
OUT_DEGREE_CENTRALITY | 0.943530 | 0.722659 | 0.992800 | 0.962756 | 0.740622 | 1.000000 | 0.531070 | 0.329623 | 0.031346 | 0.596529 | 0.667287 | -0.661709 | 0.369877 | 0.382318 | -0.258572 | 0.840450 |
AVG_NEIGHBOR_DEGREE | 0.539215 | 0.424597 | 0.559940 | 0.512342 | 0.395934 | 0.531070 | 1.000000 | -0.110202 | -0.009636 | 0.526910 | 0.076573 | -0.222635 | 0.560895 | 0.577173 | -0.443239 | 0.484496 |
PAGERANK | 0.342109 | 0.352711 | 0.301901 | 0.380614 | 0.401356 | 0.329623 | -0.110202 | 1.000000 | 0.109268 | 0.213201 | 0.603015 | -0.398695 | -0.077618 | -0.073561 | 0.041703 | 0.276802 |
KATZ | 0.030614 | 0.044777 | 0.018554 | 0.044864 | 0.059710 | 0.031346 | -0.009636 | 0.109268 | 1.000000 | 0.024699 | 0.084676 | -0.088603 | -0.013311 | -0.012970 | 0.032814 | 0.003484 |
CLOSENESS_CENTRALITY | 0.779620 | 0.888389 | 0.633813 | 0.748006 | 0.873463 | 0.596529 | 0.526910 | 0.213201 | 0.024699 | 1.000000 | 0.327792 | -0.342888 | 0.804916 | 0.828565 | -0.733591 | 0.785403 |
BETWEENNESS_CENTRALITY | 0.579524 | 0.435932 | 0.614865 | 0.642281 | 0.493825 | 0.667287 | 0.076573 | 0.603015 | 0.084676 | 0.327792 | 1.000000 | -0.577527 | 0.011448 | 0.014592 | -0.088210 | 0.417665 |
CLUSTCOEF | -0.582810 | -0.439187 | -0.617850 | -0.633303 | -0.480728 | -0.661709 | -0.222635 | -0.398695 | -0.088603 | -0.342888 | -0.577527 | 1.000000 | -0.101121 | -0.111978 | 0.185684 | -0.456913 |
NUM_NODES | 0.560734 | 0.675427 | 0.432515 | 0.491743 | 0.611031 | 0.369877 | 0.560895 | -0.077618 | -0.013311 | 0.804916 | 0.011448 | -0.101121 | 1.000000 | 0.969029 | -0.544588 | 0.620465 |
NUM_EDGES | 0.578295 | 0.696580 | 0.446061 | 0.508283 | 0.631584 | 0.382318 | 0.577173 | -0.073561 | -0.012970 | 0.828565 | 0.014592 | -0.111978 | 0.969029 | 1.000000 | -0.603055 | 0.652760 |
AVERAGECLUSTCOEF | -0.366186 | -0.441086 | -0.282453 | -0.343765 | -0.427157 | -0.258572 | -0.443239 | 0.041703 | 0.032814 | -0.733591 | -0.088210 | 0.185684 | -0.544588 | -0.603055 | 1.000000 | -0.359057 |
TRIANGLES | 0.958205 | 0.926553 | 0.884874 | 0.922991 | 0.905375 | 0.840450 | 0.484496 | 0.276802 | 0.003484 | 0.785403 | 0.417665 | -0.456913 | 0.620465 | 0.652760 | -0.359057 | 1.000000 |
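Many of these statistics are strongly correlated with one another (e.g. DEGREE and DEGREE_CENTRALITY at 0.99, DEGREE and TRIANGLES at 0.96), so the features carry a lot of redundant information. This is part of the motivation for the PCA pass later in the notebook.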
netStatsWidePctChange.head()
index | Afghanistan | ... | Yemen, P.D. Rep. | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | AVERAGECLUSTCOEF | CLOSENESS_CENTRALITY | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | ... | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | PAGERANK | TRIANGLES |
PERIOD | |||||||||||||||||||||
1985-02-01 | -0.001313 | 0.000000 | -0.027077 | 0.057692 | 0.057692 | 0.041667 | 0.041667 | -0.090895 | 0.005324 | 0.0 | ... | 0.037767 | 0.115385 | 0.115385 | 0.147059 | 0.147059 | -0.079270 | 0.005324 | 0.0 | 0.061019 | 0.214286 |
1985-03-01 | 0.009252 | 0.012500 | 0.059892 | -0.072727 | -0.072727 | 0.040000 | 0.040000 | 1.068448 | 0.005575 | 0.0 | ... | -0.006298 | -0.051724 | -0.051724 | -0.076923 | -0.076923 | 0.120605 | 0.005575 | 0.0 | -0.253603 | -0.133795 |
1985-04-01 | 0.002814 | 0.008403 | 0.021888 | 0.058824 | 0.058824 | 0.076923 | 0.076923 | 0.021696 | 0.009286 | 0.0 | ... | 0.017743 | 0.036364 | 0.036364 | 0.055556 | 0.055556 | -0.094392 | 0.009286 | 0.0 | 0.052174 | 0.167776 |
1985-05-01 | -0.015002 | -0.028571 | -0.044265 | -0.092593 | -0.092593 | -0.214286 | -0.214286 | 0.053878 | -0.008514 | 0.0 | ... | -0.000867 | -0.052632 | -0.052632 | -0.078947 | -0.078947 | -0.101800 | -0.008514 | 0.0 | 0.216340 | -0.168757 |
1985-06-01 | 0.008185 | 0.020833 | 0.001020 | 0.102041 | 0.102041 | 0.227273 | 0.227273 | -0.238598 | 0.010942 | 0.0 | ... | 0.016146 | 0.055556 | 0.055556 | 0.057143 | 0.057143 | 0.385824 | 0.010942 | 0.0 | 0.090958 | 0.091907 |
5 rows × 1618 columns
netStatsWidePctChange.corr()
index | Afghanistan | ... | Yemen, P.D. Rep. | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | AVERAGECLUSTCOEF | CLOSENESS_CENTRALITY | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | ... | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | PAGERANK | TRIANGLES | |
index | variable | |||||||||||||||||||||
Afghanistan | AVERAGECLUSTCOEF | 1.000000 | 0.007301 | 0.270514 | 0.096660 | 0.112151 | -0.035895 | -0.014978 | -0.020685 | 0.258529 | -0.188801 | ... | 0.084061 | 0.036299 | 0.034930 | 0.024407 | 0.023363 | 0.012110 | 0.307384 | 0.064097 | -0.108507 | 0.017484 |
CLOSENESS_CENTRALITY | 0.007301 | 1.000000 | 0.070596 | 0.485592 | 0.487099 | 0.872270 | 0.877717 | -0.015846 | 0.275702 | 0.015182 | ... | -0.002341 | 0.069233 | 0.070273 | 0.037095 | 0.038194 | 0.031768 | 0.104245 | -0.061700 | -0.049887 | 0.037153 | |
CLUSTCOEF | 0.270514 | 0.070596 | 1.000000 | -0.335108 | -0.322730 | 0.031990 | 0.046749 | -0.085915 | 0.027180 | -0.143344 | ... | 0.039380 | -0.069211 | -0.072786 | -0.018393 | -0.021728 | -0.015853 | 0.062155 | 0.179089 | -0.167428 | -0.039580 | |
DEGREE | 0.096660 | 0.485592 | -0.335108 | 1.000000 | 0.997265 | 0.600478 | 0.602927 | 0.040919 | 0.229174 | 0.036155 | ... | 0.033812 | 0.054782 | 0.057311 | 0.037522 | 0.040003 | -0.035386 | 0.194808 | -0.133062 | 0.028320 | 0.035866 | |
DEGREE_CENTRALITY | 0.112151 | 0.487099 | -0.322730 | 0.997265 | 1.000000 | 0.591543 | 0.599714 | 0.040684 | 0.193812 | -0.037347 | ... | 0.032426 | 0.055273 | 0.057969 | 0.037527 | 0.040160 | -0.035778 | 0.196003 | -0.141562 | 0.027863 | 0.036221 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Yemen, P.D. Rep. | KATZ | 0.012110 | 0.031768 | -0.015853 | -0.035386 | -0.035778 | 0.011603 | 0.011356 | -0.053152 | 0.018031 | 0.004450 | ... | 0.182811 | 0.135961 | 0.135791 | 0.075862 | 0.075829 | 1.000000 | 0.026089 | 0.002197 | -0.000767 | -0.026681 |
NUM_EDGES | 0.307384 | 0.104245 | 0.062155 | 0.194808 | 0.196003 | 0.141730 | 0.143809 | 0.077874 | 0.336449 | -0.007421 | ... | 0.151252 | 0.269836 | 0.269168 | 0.211113 | 0.210682 | 0.026089 | 1.000000 | 0.024942 | -0.054025 | 0.226644 | |
NUM_NODES | 0.064097 | -0.061700 | 0.179089 | -0.133062 | -0.141562 | 0.006909 | -0.001492 | -0.005663 | 0.005411 | 0.112824 | ... | 0.196678 | -0.045196 | -0.064841 | 0.017858 | 0.000134 | 0.002197 | 0.024942 | 1.000000 | 0.047756 | -0.016231 | |
PAGERANK | -0.108507 | -0.049887 | -0.167428 | 0.028320 | 0.027863 | -0.008172 | -0.008795 | 0.031988 | -0.014090 | 0.007996 | ... | 0.029166 | 0.221981 | 0.220902 | 0.306778 | 0.305933 | -0.000767 | -0.054025 | 0.047756 | 1.000000 | 0.325714 | |
TRIANGLES | 0.017484 | 0.037153 | -0.039580 | 0.035866 | 0.036221 | 0.125200 | 0.126826 | 0.189249 | 0.077080 | -0.003790 | ... | 0.316320 | 0.862356 | 0.861799 | 0.908394 | 0.908583 | -0.026681 | 0.226644 | -0.016231 | 0.325714 | 1.000000 |
1618 rows × 1618 columns
tsPctChange.head()
Argentina-Brazil | Argentina-Chile | Argentina-Japan | Australia-Canada | Australia-China | Australia-France | Australia-Germany | Australia-India | Australia-Italy | Australia-Japan | ... | United Kingdom-Singapore | United Kingdom-Spain | United Kingdom-Sweden | United Kingdom-Switzerland | United Kingdom-Taiwan, Province of China | United Kingdom-Tanzania, United Republic of | United Kingdom-Thailand | United Kingdom-Turkey | United Kingdom-United Arab Emirates | United Kingdom-United States | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
period | |||||||||||||||||||||
1985-02-01 | 0.062522 | -0.154539 | -0.293125 | 0.142470 | 0.062065 | -0.097768 | 0.065731 | -0.131695 | 0.000170 | -0.004596 | ... | -0.036131 | -0.029949 | 0.006133 | 0.004249 | -0.030834 | -0.285420 | -0.033476 | -0.129838 | -0.018257 | -0.006231 |
1985-03-01 | -0.157772 | -0.028198 | 0.374053 | 0.018934 | -0.088413 | -0.003325 | -0.037304 | 0.080223 | -0.137057 | -0.023654 | ... | 0.025743 | 0.069935 | 0.006004 | 0.013794 | 0.069704 | 0.261847 | 0.056897 | 0.113487 | 0.076519 | 0.018622 |
1985-04-01 | 0.076867 | 0.130921 | 0.230518 | -0.087285 | 0.164218 | 0.056872 | -0.091518 | 0.128781 | 0.182579 | 0.051857 | ... | 0.035510 | -0.039669 | 0.050269 | 0.063971 | 0.030340 | 0.175368 | 0.097109 | 0.097419 | 0.006345 | 0.040990 |
1985-05-01 | -0.079725 | -0.085492 | -0.072008 | 0.074769 | 0.035272 | -0.029843 | 0.165752 | 0.010152 | 0.004218 | -0.000145 | ... | 0.013411 | -0.008849 | -0.044910 | -0.018795 | 0.022171 | 0.196199 | -0.045813 | -0.046656 | -0.076937 | 0.022249 |
1985-06-01 | 0.037981 | 0.148035 | 0.147914 | 0.027766 | 0.093546 | 0.115701 | -0.122772 | 0.011665 | -0.078161 | 0.012099 | ... | -0.056779 | 0.013070 | 0.005069 | 0.012590 | -0.035781 | -0.138247 | -0.038296 | 0.012301 | 0.063594 | -0.020316 |
5 rows × 640 columns
importers=pd.Series(col.split('-')[0] for col in tsPctChange.columns).unique()
exporters=pd.Series(col.split('-')[1] for col in tsPctChange.columns).unique()
allEcons=sorted(set(list(importers) + list(exporters)))
# count the distinct network statistics (without reusing the `netStats` DataFrame name)
numNetStats=pd.Series(col[1] for col in netStatsWidePctChange.columns).nunique()
print('The upper bound on the number of tests:', len(allEcons)*numNetStats)
The upper bound on the number of tests: 1216
# https://www.kaggle.com/felipefiorini/xgboost-hyper-parameter-tuning
def hyperParameterTuning(X_train, y_train):
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators': [100, 200, 500],
        'objective': ['reg:squarederror']
    }
    xgb_model = xgb.XGBRegressor()
    gsearch = GridSearchCV(estimator=xgb_model,
                           param_grid=param_tuning,
                           # scoring='neg_mean_absolute_error',  # MAE
                           # scoring='neg_mean_squared_error',   # MSE
                           cv=5,
                           n_jobs=-1,
                           verbose=0)
    gsearch.fit(X_train, y_train)
    return gsearch.best_params_
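One caveat on the search above: GridSearchCV's default 5-fold CV ignores time ordering, so validation folds can come before the data the model trained on. A time-aware variant (a sketch, not what was run for the results below) would swap in `TimeSeriesSplit`; the smaller grid here is just to keep the example short.

# sketch: hyperparameter search with ordered, expanding-window folds
from sklearn.model_selection import TimeSeriesSplit

def hyperParameterTuningTS(X_train, y_train):
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
    }
    gsearch = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror'),
                           param_grid=param_tuning,
                           cv=TimeSeriesSplit(n_splits=5),  # each validation fold follows its training data
                           n_jobs=-1)
    gsearch.fit(X_train, y_train)
    return gsearch.best_params_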
# https://xgboost.readthedocs.io/en/latest/python/examples/index.html
# https://xgboost.readthedocs.io/en/stable/parameter.html
# https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn
# https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning
results={}
econs=pd.Series(col for col in tsPctChange.columns).unique()
for tempSeries in tqdm(econs):
    importer=tempSeries.split('-')[0]
    exporter=tempSeries.split('-')[1]
    # features: network statistics for both economies in the pair
    X=netStatsWidePctChange[[col for col in netStatsWidePctChange.columns
                             if col[0] == importer or col[0] == exporter]]
    X.columns=["-".join(col) for col in X.columns]
    # target: the bilateral trade series
    y=tsPctChange[[tempSeries]]
    # only fit when there is data for the model
    if not X.empty and not y.empty:
        results[tempSeries]={}
        results[tempSeries]['y_std']=y.std()
        results[tempSeries]['series']=tempSeries
        # shuffle=False keeps the last 10% of the time series as the test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
        results[tempSeries]['y_test_std']=y_test.std()
        bestParams=hyperParameterTuning(X_train, y_train)
        results[tempSeries]['bestParams']=bestParams
        bst = xgb.XGBRegressor(
            objective = 'reg:squarederror',
            colsample_bytree = bestParams['colsample_bytree'],
            learning_rate = bestParams['learning_rate'],
            max_depth = bestParams['max_depth'],
            min_child_weight = bestParams['min_child_weight'],
            n_estimators = bestParams['n_estimators'],
            subsample = bestParams['subsample'],
            nthread=4)
        bst.fit(X_train, y_train)
        results[tempSeries]['model']=bst
        y_pred = bst.predict(X_test)
        results[tempSeries]['mse']=mean_squared_error(y_test, y_pred)
        results[tempSeries]['data']=[X_train, X_test, y_train, y_test, y_pred]
        # store the per-feature importances from the fitted booster
        for importance in ['weight', 'gain', 'cover']:
            results[tempSeries][importance]=bst.get_booster().get_score(importance_type=importance)
100%|██████████| 640/640 [37:32:46<00:00, 211.20s/it]
params=['learning_rate',
        'max_depth',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
        'n_estimators',
        'objective']
ncols=4
nrows = ceil(len(params) / ncols)
width = ncols * 5
length = nrows * 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width,length))
# one bar chart per hyperparameter, counting how often each value was selected
for param, ax in zip(params, axes.flatten()):
    brParams=pd.DataFrame([results[key]['bestParams'][param] for key in results])
    brParams.columns=[param]
    counts=pd.DataFrame(brParams[param].value_counts())
    ax.barh(counts.index.astype('str'), counts.iloc[:,0])
    ax.set_title(param)
plt.tight_layout()
# histogram of each network statistic's importance across all fitted models
for importance in ['weight', 'gain', 'cover']:
    df=pd.DataFrame([results[key][importance] for key in results])
    df=df.melt()
    # feature names are "econ-NETSTAT"; split on the last '-' so economy names keep their own hyphens
    df[['econ', 'netStat']] = df['variable'].str.rsplit('-', n=1, expand=True)
    df=df[['netStat', 'value']].pivot(columns='netStat')
    nrows = ceil(len(df.columns) / ncols)
    width = ncols * 5
    length = nrows * 3
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width,length))
    fig.suptitle(importance)
    for i, col in enumerate(df.columns):
        df[[col]].dropna().hist(ax=axes.flatten()[i])
    plt.tight_layout()
pd.DataFrame([results[key]['mse'] for key in list(results.keys())]).hist(bins=100)
array([[<AxesSubplot:title={'center':'0'}>]], dtype=object)
good=[results[key]['series'] for key in list(results.keys()) if results[key]['mse'] < .0002]
good
['Australia-China', 'Canada-United States', 'Germany-Austria', 'Germany-China', 'Germany-Netherlands', 'Germany-Switzerland', 'Japan-Korea, Republic of', 'Japan-Taiwan, Province of China', 'Korea, Republic of-Japan', 'Korea, Republic of-United States', 'Netherlands-France', 'Netherlands-Germany']
bad=[results[key]['series'] for key in list(results.keys()) if results[key]['mse'] > .05]
bad
['Canada-Trinidad and Tobago', 'Italy-Syrian Arab Republic', 'Italy-Venezuela, Bolivarian Republic', 'Japan-Greece', 'Malta-United Kingdom', 'Switzerland-New Zealand', 'Switzerland-United Kingdom', 'United Kingdom-Iran, Islamic Republic of']
def plotSeries(inputSeries):
    for series in inputSeries:
        plt.figure()
        modelData=results[series]['data']
        plt.title(results[series]['series'])
        # actual test values in green, forecasts as blue stars
        plt.plot(modelData[3].values, 'g')
        plt.plot(modelData[4], '*b')
plotSeries(good)
plotSeries(bad)
scatterDF=[]
for key in results:
    scatterDF.append(pd.DataFrame(
        {'mse':results[key]['mse'],
         'y_std':results[key]['y_std']}))
scatterDF=pd.concat(scatterDF)
scatterDF.plot.scatter(x='mse', y='y_std')
plt.title('train y std dev')
Text(0.5, 1.0, 'train y std dev')
scatterDF=[]
for key in results:
    scatterDF.append(pd.DataFrame(
        {'mse':results[key]['mse'],
         'y_std':results[key]['y_test_std']}))
scatterDF=pd.concat(scatterDF)
scatterDF.plot.scatter(x='mse', y='y_std')
plt.title('test y std dev')
Text(0.5, 1.0, 'test y std dev')
Out-of-sample forecasting is easier on series with lower standard deviation: test-set MSE climbs with the volatility of the target.
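To put a number on that, a quick sketch correlating each pair's test MSE with the standard deviation of its test-set target (uses the `results` dict built above):

# correlation between test MSE and test-target volatility across all pairs
mses = [results[k]['mse'] for k in results]
stds = [results[k]['y_test_std'].iloc[0] for k in results]
print(np.corrcoef(mses, stds)[0, 1])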
# https://xgboost.readthedocs.io/en/latest/python/examples/index.html
# https://xgboost.readthedocs.io/en/stable/parameter.html
# https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn
pcaResults={}
econs=pd.Series(col for col in tsPctChange.columns).unique()
for tempSeries in tqdm(econs):
    importer=tempSeries.split('-')[0]
    # features: the importer's network statistics only
    X=netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == importer]]
    X.columns=[col[1] for col in X.columns]
    # target: the bilateral trade series
    y=tsPctChange[[tempSeries]]
    # only fit when there is data for the model
    if not X.empty and not y.empty:
        pcaResults[tempSeries]={}
        pcaResults[tempSeries]['y_std']=y.std()
        pcaResults[tempSeries]['series']=tempSeries
        ##### PCA
        # note: the scaler and PCA are fit on the full sample here; fitting them on
        # the training window only would avoid leaking test-set information
        scaler = StandardScaler()
        scaledData = pd.DataFrame(scaler.fit_transform(X))
        # reduce the importer's statistics to four principal components
        n_components=4
        pcaModel = PCA(n_components=n_components)
        pcaModelFit = pcaModel.fit(scaledData)
        X_pca = pd.DataFrame(pcaModelFit.transform(scaledData), columns=[str(col) for col in range(n_components)])
        X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.1, shuffle=False)
        pcaResults[tempSeries]['y_test_std']=y_test.std()
        bestParams=hyperParameterTuning(X_train, y_train)
        pcaResults[tempSeries]['bestParams']=bestParams
        bst = xgb.XGBRegressor(
            objective = 'reg:squarederror',
            colsample_bytree = bestParams['colsample_bytree'],
            learning_rate = bestParams['learning_rate'],
            max_depth = bestParams['max_depth'],
            min_child_weight = bestParams['min_child_weight'],
            n_estimators = bestParams['n_estimators'],
            subsample = bestParams['subsample'],
            nthread=4)
        bst.fit(X_train, y_train)
        pcaResults[tempSeries]['model']=bst
        y_pred = bst.predict(X_test)
        pcaResults[tempSeries]['mse']=mean_squared_error(y_test, y_pred)
        pcaResults[tempSeries]['data']=[X_train, X_test, y_train, y_test, y_pred]
        for importance in ['weight', 'gain', 'cover']:
            pcaResults[tempSeries][importance]=bst.get_booster().get_score(importance_type=importance)
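As noted in the comments above, fitting the scaler and PCA on the full sample lets a little test-set information into the features. A leakage-free variant (a sketch, not what was run here; `X_raw` stands for one pair's unscaled statistics) would fit everything on the training window inside a Pipeline:

# sketch: scaler + PCA + XGBoost fit on the training window only
from sklearn.pipeline import Pipeline

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.1, shuffle=False)
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=4)),
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror')),
])
pipe.fit(X_train_raw, y_train)      # scaler and PCA see only training data
y_pred = pipe.predict(X_test_raw)   # test rows are transformed with training-fit parameters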
100%|██████████| 640/640 [29:39:39<00:00, 166.84s/it]
params=['learning_rate',
        'max_depth',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
        'n_estimators',
        'objective']
ncols=4
nrows = ceil(len(params) / ncols)
width = ncols * 5
length = nrows * 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width,length))
for param, ax in zip(params, axes.flatten()):
    brParams=pd.DataFrame([pcaResults[key]['bestParams'][param] for key in pcaResults])
    brParams.columns=[param]
    counts=pd.DataFrame(brParams[param].value_counts())
    ax.barh(counts.index.astype('str'), counts.iloc[:,0])
    ax.set_title(param)
plt.tight_layout()
# `importance` is still 'cover' from the loop above; inspect it across the four components
df=pd.DataFrame([pcaResults[key][importance] for key in pcaResults])
df
| 0 | 1 | 2 | 3
---|---|---|---|---|
0 | 95.561462 | 92.752876 | 118.098618 | 116.877548 |
1 | 127.792336 | 115.018021 | 138.342926 | 109.401573 |
2 | 95.904762 | 100.220627 | 116.283112 | 107.361427 |
3 | 160.885559 | 164.918884 | 136.924835 | 143.798813 |
4 | 158.978653 | 162.783722 | 148.036499 | 141.698318 |
... | ... | ... | ... | ... |
619 | 96.807587 | 105.974869 | 97.704742 | 98.943565 |
620 | 133.173447 | 132.842651 | 143.961609 | 115.667351 |
621 | 125.636581 | 120.315910 | 112.648201 | 114.719627 |
622 | 115.836693 | 108.164474 | 125.263817 | 114.302780 |
623 | 138.017899 | 137.224213 | 131.007187 | 145.280807 |
624 rows × 4 columns
for importance in ['weight', 'gain', 'cover']:
    df=pd.DataFrame([pcaResults[key][importance] for key in pcaResults])
    nrows = ceil(len(df.columns) / ncols)
    width = ncols * 5
    length = nrows * 3
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width,length))
    fig.suptitle(importance)
    # one histogram per principal component
    for i, col in enumerate(df.columns):
        df[[col]].dropna().hist(ax=axes.flatten()[i])
    plt.tight_layout()
# overlay the test-MSE distributions on one set of axes
fig, ax = plt.subplots()
# pre PCA
pd.DataFrame([results[key]['mse'] for key in results]).hist(bins=100, ax=ax)
# PCA (black)
pd.DataFrame([pcaResults[key]['mse'] for key in pcaResults]).hist(bins=100, color="k", ax=ax)
array([[<AxesSubplot:title={'center':'0'}>]], dtype=object)
resultsDF=pd.DataFrame([results[key]['mse'] for key in results], index=list(results.keys()))
resultsDF.columns=['prePCA']
pcaResultsDF=pd.DataFrame([pcaResults[key]['mse'] for key in pcaResults], index=list(pcaResults.keys()))
pcaResultsDF.columns=['PCA']
resultsDF=resultsDF.join(pcaResultsDF, how='outer')
resultsDF['diff'] = resultsDF['PCA'] - resultsDF['prePCA']
ax=resultsDF['diff'].hist(bins=100)
ax.set_title('Change of Error after PCA')
Text(0.5, 1.0, 'Change of Error after PCA')
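One more summary of the histogram above: the share of pairs whose test MSE fell after PCA (a one-liner on the `resultsDF` just built).

# fraction of series whose out-of-sample error decreased with PCA features
print((resultsDF['diff'] < 0).mean())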
improved=resultsDF['diff'].nsmallest()
improved
Austria-Egypt     -0.004609
Germany-Bahrain   -0.004135
Portugal-Norway   -0.003896
Canada-Colombia   -0.003849
Japan-Greece      -0.003632
Name: diff, dtype: float64
worse=resultsDF['diff'].nlargest()
worse
Spain-Iran, Islamic Republic of         0.007505
Canada-Trinidad and Tobago              0.004254
Italy-Venezuela, Bolivarian Republic    0.003689
United Kingdom-Qatar                    0.003213
Philippines-Australia                   0.003145
Name: diff, dtype: float64
def plotPCAseries(inputSeries):
    for series in inputSeries:
        plt.figure()
        pcaModelData=pcaResults[series]['data']
        modelData=results[series]['data']
        plt.title(results[series]['series'])
        # actual test values in green
        plt.plot(pcaModelData[3].values, 'g')
        # pre-PCA forecast as blue stars, PCA forecast as red stars
        plt.plot(modelData[4], '*b')
        plt.plot(pcaModelData[4], '*r')
Red is the PCA model forecast, blue is the pre-PCA model forecast, and green is the actual series.
plotPCAseries(improved.index)
plotPCAseries(worse.index)
scatterDF=[]
for key in pcaResults:
    scatterDF.append(pd.DataFrame(
        {'mse':pcaResults[key]['mse'],
         'y_std':pcaResults[key]['y_std']}))
scatterDF=pd.concat(scatterDF)
scatterDF.plot.scatter(x='mse', y='y_std')
plt.title('train y std dev')
Text(0.5, 1.0, 'train y std dev')
scatterDF=[]
for key in pcaResults:
    scatterDF.append(pd.DataFrame(
        {'mse':pcaResults[key]['mse'],
         'y_std':pcaResults[key]['y_test_std']}))
scatterDF=pd.concat(scatterDF)
scatterDF.plot.scatter(x='mse', y='y_std')
plt.title('test y std dev')
Text(0.5, 1.0, 'test y std dev')