import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


from numpy import absolute
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


from tqdm import tqdm

from math import ceil


timeSeries=(pd.read_csv('dotsTimeSeries.csv')
    .pivot_table(index='period', columns=['ReferenceArea', 'CounterpartReferenceArea'], values='value')
)


tsPctChange=np.log(timeSeries).pct_change().iloc[1:].dropna(axis=1)
tsPctChange.columns=['-'.join(col) for col in tsPctChange.columns]
tsPctChange[tsPctChange>1.5]=np.nan
tsPctChange[tsPctChange<-1.5]=np.nan
tsPctChange=tsPctChange.dropna(axis=1)
tsPctChange.index=pd.to_datetime(tsPctChange.index)
tsPctChange=tsPctChange[tsPctChange.index > '1985-01-01']


netStats=pd.read_csv('DOTSnetStats.csv').drop(['Unnamed: 0', 'CONNECTIVITY', 'HAS_BRIDGE', 'TOTAL_NET_VALUE', 'PAGERANK_NUMPY'],axis=1)
netStats.set_index(['index', 'PERIOD'], inplace=True)
# get to period index and econ, stats cols
netStatsWide=(netStats
.reset_index()
.melt(id_vars=['index', 'PERIOD'])
.pivot_table(index='PERIOD', columns=['index', 'variable'], values='value')
)
netStatsWide.index = pd.to_datetime(netStatsWide.index)
netStatsWidePctChange=netStatsWide.pct_change().iloc[1:].dropna(axis=1)
netStatsWidePctChange.index=pd.to_datetime(netStatsWidePctChange.index)
netStatsWidePctChange=netStatsWidePctChange[netStatsWidePctChange.index > '1985-01-01']


# lag the net stats to not leak information
netStatsWidePctChange=netStatsWidePctChange.shift(-1).iloc[:-1]
# take off a period of time series so sizes match
tsPctChange=tsPctChange.iloc[:-1]


netStats.corr()


netStatsWidePctChange.head()


netStatsWidePctChange.corr()


tsPctChange.head()


importers=pd.Series(col.split('-')[0] for col in tsPctChange.columns).unique()
exporters=pd.Series(col.split('-')[1] for col in tsPctChange.columns).unique()
allEcons=sorted(set(list(importers) + list(exporters)))
netStats=pd.Series(col[1] for col in netStatsWidePctChange.columns).nunique()

print('The upper-bound on number of tests:', len(allEcons)*netStats)

The upper-bound on number of tests: 1216


# https://www.kaggle.com/felipefiorini/xgboost-hyper-parameter-tuning
# https://www.kaggle.com/felipefiorini/xgboost-hyper-parameter-tuning/notebook

def hyperParameterTuning(X_train, y_train):
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500],
        'objective': ['reg:squarederror']
    }

    xgb_model = xgb.XGBRegressor()

    gsearch = GridSearchCV(estimator = xgb_model,
                           param_grid = param_tuning,
                           #scoring = 'neg_mean_absolute_error', #MAE
                           #scoring = 'neg_mean_squared_error',  #MSE
                           cv = 5,
                           n_jobs = -1,
                           verbose = 0)

    gsearch.fit(X_train,y_train)

    return(gsearch.best_params_)


# https://xgboost.readthedocs.io/en/latest/python/examples/index.html
# https://xgboost.readthedocs.io/en/stable/parameter.html
# https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn

# https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning


results={}

econs=pd.Series(col for col in tsPctChange.columns).unique()
tempSeries='Argentina-Brazil'
for tempSeries in tqdm(econs):
    # create dataset
    # network statistics
    X=netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == tempSeries.split('-')[0] or col[0] == tempSeries.split('-')[1]]]
    X.columns=["-".join(col) for col in X.columns]
    X_temp=X

    # bilateral trade series
    y=tsPctChange[[tempSeries]]

    # if there is data for model
    if not X_temp.empty and not y.empty:
        results[tempSeries]={}
        results[tempSeries]['y_std']=y.std()
        results[tempSeries]['series']=tempSeries
        X_train, X_test, y_train, y_test = train_test_split(X_temp, y, test_size=0.1, shuffle=False)
        results[tempSeries]['y_test_std']=y_test.std()

        bestParams=hyperParameterTuning(X_train, y_train)

        bst = xgb.XGBRegressor(
            objective = 'reg:squarederror',
            colsample_bytree = bestParams['colsample_bytree'],
            learning_rate = bestParams['learning_rate'],
            max_depth = bestParams['max_depth'],
            min_child_weight = bestParams['min_child_weight'],
            n_estimators = bestParams['n_estimators'],
            subsample = bestParams['subsample'],
            nthread=4)

        results[tempSeries]['bestParams']=bestParams

        bst.fit(X_train, y_train)

        results[tempSeries]['model']=bst

        y_pred = bst.predict(X_test)

        mse=mean_squared_error(y_test, y_pred)
        results[tempSeries]['mse']=mse

        results[tempSeries]['data']=[X_train, X_test, y_train, y_test, y_pred]


        importances=['weight', 'gain', 'cover']
        for importance in importances:
            results[tempSeries][importance]=(bst.get_booster().get_score(importance_type=importance))

100%|██████████| 640/640 [37:32:46<00:00, 211.20s/it]


params=['learning_rate',
        'max_depth',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
        'n_estimators',
        'objective']
ncols=4

nrows = ceil(len(params) / ncols)

width = ncols * 5
length = nrows * 3

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width,length))

for param, ax in zip(params, axes.flatten()):
    brParams=pd.DataFrame([results[key]['bestParams'][param] for key in list(results.keys())])
    brParams.columns=[param]
    counts=pd.DataFrame(brParams[param].value_counts())
    ax.barh(counts.index.astype('str'), counts.iloc[:,0])
    ax.set_title(param)

plt.tight_layout()


importances=['weight', 'gain', 'cover']

for importance in importances:
    plt.figure()

    df=pd.DataFrame([results[key][importance] for key in list(results.keys())])
    width = ncols * 5
    length = nrows * 3

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width,length))
    fig.suptitle(importance)
    nrows = ceil(len(pd.Series(col.split('-')[1] for col in df.columns).unique())/ ncols)

    df=df.melt()
    df[['econ', 'netStat']] = df['variable'].str.split('-', 1, expand=True)
    df
    df=df[['netStat', 'value']].pivot(columns='netStat')
    for i, col in enumerate(df.columns):
        df[[col]].dropna().hist(ax=axes.flatten()[i])


plt.tight_layout()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>


pd.DataFrame([results[key]['mse'] for key in list(results.keys())]).hist(bins=100)

array([[<AxesSubplot:title={'center':'0'}>]], dtype=object)


good=[results[key]['series'] for key in list(results.keys()) if results[key]['mse'] < .0002]
good

['Australia-China',
 'Canada-United States',
 'Germany-Austria',
 'Germany-China',
 'Germany-Netherlands',
 'Germany-Switzerland',
 'Japan-Korea, Republic of',
 'Japan-Taiwan, Province of China',
 'Korea, Republic of-Japan',
 'Korea, Republic of-United States',
 'Netherlands-France',
 'Netherlands-Germany']


bad=[results[key]['series'] for key in list(results.keys()) if results[key]['mse'] > .05]
bad

['Canada-Trinidad and Tobago',
 'Italy-Syrian Arab Republic',
 'Italy-Venezuela, Bolivarian Republic',
 'Japan-Greece',
 'Malta-United Kingdom',
 'Switzerland-New Zealand',
 'Switzerland-United Kingdom',
 'United Kingdom-Iran, Islamic Republic of']


def plotSeries(inputSeries):
    for i, series in enumerate(inputSeries):
        plt.figure()
        modelData=results[inputSeries[i]]['data']

        plt.title(results[inputSeries[i]]['series'])
        plt.plot(modelData[3].values, 'g')
        plt.plot(modelData[4], '*b')


plotSeries(good)


plotSeries(bad)


scatterDF=[]
for key in list(results.keys()):
    scatterDF.append(pd.DataFrame(
    {'mse':results[key]['mse'],
    'y_std':results[key]['y_std']}))

scatterDF=pd.concat(scatterDF)
scatterDF.plot.scatter(x='mse', y='y_std')
plt.title('train y std dev')

Text(0.5, 1.0, 'train y std dev')


scatterDF=[]
for key in list(results.keys()):
    scatterDF.append(pd.DataFrame(
    {'mse':results[key]['mse'],
    'y_std':results[key]['y_test_std']}))

scatterDF=pd.concat(scatterDF)
scatterDF.plot.scatter(x='mse', y='y_std')
plt.title('test y std dev')

Text(0.5, 1.0, 'test y std dev')


# https://xgboost.readthedocs.io/en/latest/python/examples/index.html
# https://xgboost.readthedocs.io/en/stable/parameter.html
# https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn


pcaResults={}

econs=pd.Series(col for col in tsPctChange.columns).unique()
for tempSeries in tqdm(econs):
    # create dataset
    # network statistics
    X=netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == tempSeries.split('-')[0]]]
    X.columns=[col[1] for col in X.columns]
    X_temp=X

    # bilateral trade series
    y=tsPctChange[[tempSeries]]

    # if there is data for model
    if not X_temp.empty and not y.empty:
        pcaResults[tempSeries]={}
        pcaResults[tempSeries]['y_std']=y.std()
        pcaResults[tempSeries]['series']=tempSeries

        scaler = StandardScaler()
        scaledData = pd.DataFrame(scaler.fit_transform(X_temp))

        #####   PCA
        # create model
        n_components=4
        pcaModel = PCA(n_components=n_components)

        # fit model
        pcaModelFit = pcaModel.fit(scaledData)
        X_temp = pd.DataFrame(pcaModelFit.transform(scaledData), columns=[str(col) for col in range(n_components)])

        X_train, X_test, y_train, y_test = train_test_split(X_temp, y, test_size=0.1, shuffle=False)
        pcaResults[tempSeries]['y_test_std']=y_test.std()

        bestParams=hyperParameterTuning(X_train, y_train)

        pcaResults[tempSeries]['bestParams']=bestParams

        bst = xgb.XGBRegressor(
            objective = 'reg:squarederror',
            colsample_bytree = bestParams['colsample_bytree'],
            learning_rate = bestParams['learning_rate'],
            max_depth = bestParams['max_depth'],
            min_child_weight = bestParams['min_child_weight'],
            n_estimators = bestParams['n_estimators'],
            subsample = bestParams['subsample'],
            nthread=4)

        bst.fit(X_train, y_train)

        pcaResults[tempSeries]['model']=bst

        y_pred = bst.predict(X_test)

        mse=mean_squared_error(y_test, y_pred)
        pcaResults[tempSeries]['mse']=mse

        pcaResults[tempSeries]['data']=[X_train, X_test, y_train, y_test, y_pred]


        importances=['weight', 'gain', 'cover']
        for importance in importances:
            pcaResults[tempSeries][importance]=(bst.get_booster().get_score(importance_type=importance))

100%|██████████| 640/640 [29:39:39<00:00, 166.84s/it]


params=['learning_rate',
        'max_depth',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
        'n_estimators',
        'objective']
ncols=4

nrows = ceil(len(params) / ncols)

width = ncols * 5
length = nrows * 3

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width,length))

for param, ax in zip(params, axes.flatten()):
    brParams=pd.DataFrame([pcaResults[key]['bestParams'][param] for key in list(pcaResults.keys())])
    brParams.columns=[param]
    counts=pd.DataFrame(brParams[param].value_counts())
    ax.barh(counts.index.astype('str'), counts.iloc[:,0])
    ax.set_title(param)

plt.tight_layout()


df=pd.DataFrame([pcaResults[key][importance] for key in list(pcaResults.keys())])
df


importances=['weight', 'gain', 'cover']

for importance in importances:
    plt.figure()

    df=pd.DataFrame([pcaResults[key][importance] for key in list(pcaResults.keys())])
    width = ncols * 5
    length = nrows * 3

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width,length))
    fig.suptitle(importance)
    nrows = ceil(len(pd.Series(col for col in df.columns).unique())/ ncols)

    for i, col in enumerate(df.columns):
        df[[col]].dropna().hist(ax=axes.flatten()[i])


plt.tight_layout()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>


plt.figure()
# pre PCA
pd.DataFrame([results[key]['mse'] for key in list(results.keys())]).hist(bins=100)
# PCA
pd.DataFrame([pcaResults[key]['mse'] for key in list(pcaResults.keys())]).hist(bins=100, color="k")

array([[<AxesSubplot:title={'center':'0'}>]], dtype=object)

<Figure size 432x288 with 0 Axes>


resultsDF=pd.DataFrame([results[key]['mse'] for key in list(results.keys())],
[key for key in list(results.keys())])
resultsDF.columns=['prePCA']

pcaResultsDF=pd.DataFrame([pcaResults[key]['mse'] for key in list(pcaResults.keys())],
[key for key in list(pcaResults.keys())])
pcaResultsDF.columns=['PCA']

resultsDF=resultsDF.join(pcaResultsDF, how='outer')
resultsDF['diff'] = resultsDF['PCA'] - resultsDF['prePCA']
ax=resultsDF['diff'].hist(bins=100)
ax.set_title('Change of Error after PCA')

Text(0.5, 1.0, 'Change of Error after PCA')


improved=resultsDF['diff'].nsmallest()
improved

Austria-Egypt     -0.004609
Germany-Bahrain   -0.004135
Portugal-Norway   -0.003896
Canada-Colombia   -0.003849
Japan-Greece      -0.003632
Name: diff, dtype: float64


worse=resultsDF['diff'].nlargest()
worse

Spain-Iran, Islamic Republic of         0.007505
Canada-Trinidad and Tobago              0.004254
Italy-Venezuela, Bolivarian Republic    0.003689
United Kingdom-Qatar                    0.003213
Philippines-Australia                   0.003145
Name: diff, dtype: float64


def plotPCAseries(inputSeries):
    for i, series in enumerate(inputSeries):
        plt.figure()
        pcaModelData=pcaResults[inputSeries[i]]['data']
        modelData=results[inputSeries[i]]['data']

        plt.title(results[inputSeries[i]]['series'])
        plt.plot(pcaModelData[3].values, 'g')
        plt.plot(modelData[4], '*b')
        plt.plot(pcaModelData[4], '*r')


plotPCAseries(improved.index)


plotPCAseries(worse.index)


scatterDF=[]
for key in list(pcaResults.keys()):
    scatterDF.append(pd.DataFrame(
    {'mse':pcaResults[key]['mse'],
    'y_std':pcaResults[key]['y_std']}))

scatterDF=pd.concat(scatterDF)
scatterDF.plot.scatter(x='mse', y='y_std')
plt.title('train y std dev')

Text(0.5, 1.0, 'train y std dev')


scatterDF=[]
for key in list(pcaResults.keys()):
    scatterDF.append(pd.DataFrame(
    {'mse':pcaResults[key]['mse'],
    'y_std':pcaResults[key]['y_test_std']}))

scatterDF=pd.concat(scatterDF)
scatterDF.plot.scatter(x='mse', y='y_std')
plt.title('test y std dev')

Text(0.5, 1.0, 'test y std dev')

	DEGREE	IN_DEGREE	OUT_DEGREE	DEGREE_CENTRALITY	IN_DEGREE_CENTRALITY	OUT_DEGREE_CENTRALITY	AVG_NEIGHBOR_DEGREE	PAGERANK	KATZ	CLOSENESS_CENTRALITY	BETWEENNESS_CENTRALITY	CLUSTCOEF	NUM_NODES	NUM_EDGES	AVERAGECLUSTCOEF	TRIANGLES
DEGREE	1.000000	0.905947	0.962545	0.992468	0.907749	0.943530	0.539215	0.342109	0.030614	0.779620	0.579524	-0.582810	0.560734	0.578295	-0.366186	0.958205
IN_DEGREE	0.905947	1.000000	0.757224	0.879958	0.993021	0.722659	0.424597	0.352711	0.044777	0.888389	0.435932	-0.439187	0.675427	0.696580	-0.441086	0.926553
OUT_DEGREE	0.962545	0.757224	1.000000	0.967567	0.764474	0.992800	0.559940	0.301901	0.018554	0.633813	0.614865	-0.617850	0.432515	0.446061	-0.282453	0.884874
DEGREE_CENTRALITY	0.992468	0.879958	0.967567	1.000000	0.894707	0.962756	0.512342	0.380614	0.044864	0.748006	0.642281	-0.633303	0.491743	0.508283	-0.343765	0.922991
IN_DEGREE_CENTRALITY	0.907749	0.993021	0.764474	0.894707	1.000000	0.740622	0.395934	0.401356	0.059710	0.873463	0.493825	-0.480728	0.611031	0.631584	-0.427157	0.905375
OUT_DEGREE_CENTRALITY	0.943530	0.722659	0.992800	0.962756	0.740622	1.000000	0.531070	0.329623	0.031346	0.596529	0.667287	-0.661709	0.369877	0.382318	-0.258572	0.840450
AVG_NEIGHBOR_DEGREE	0.539215	0.424597	0.559940	0.512342	0.395934	0.531070	1.000000	-0.110202	-0.009636	0.526910	0.076573	-0.222635	0.560895	0.577173	-0.443239	0.484496
PAGERANK	0.342109	0.352711	0.301901	0.380614	0.401356	0.329623	-0.110202	1.000000	0.109268	0.213201	0.603015	-0.398695	-0.077618	-0.073561	0.041703	0.276802
KATZ	0.030614	0.044777	0.018554	0.044864	0.059710	0.031346	-0.009636	0.109268	1.000000	0.024699	0.084676	-0.088603	-0.013311	-0.012970	0.032814	0.003484
CLOSENESS_CENTRALITY	0.779620	0.888389	0.633813	0.748006	0.873463	0.596529	0.526910	0.213201	0.024699	1.000000	0.327792	-0.342888	0.804916	0.828565	-0.733591	0.785403
BETWEENNESS_CENTRALITY	0.579524	0.435932	0.614865	0.642281	0.493825	0.667287	0.076573	0.603015	0.084676	0.327792	1.000000	-0.577527	0.011448	0.014592	-0.088210	0.417665
CLUSTCOEF	-0.582810	-0.439187	-0.617850	-0.633303	-0.480728	-0.661709	-0.222635	-0.398695	-0.088603	-0.342888	-0.577527	1.000000	-0.101121	-0.111978	0.185684	-0.456913
NUM_NODES	0.560734	0.675427	0.432515	0.491743	0.611031	0.369877	0.560895	-0.077618	-0.013311	0.804916	0.011448	-0.101121	1.000000	0.969029	-0.544588	0.620465
NUM_EDGES	0.578295	0.696580	0.446061	0.508283	0.631584	0.382318	0.577173	-0.073561	-0.012970	0.828565	0.014592	-0.111978	0.969029	1.000000	-0.603055	0.652760
AVERAGECLUSTCOEF	-0.366186	-0.441086	-0.282453	-0.343765	-0.427157	-0.258572	-0.443239	0.041703	0.032814	-0.733591	-0.088210	0.185684	-0.544588	-0.603055	1.000000	-0.359057
TRIANGLES	0.958205	0.926553	0.884874	0.922991	0.905375	0.840450	0.484496	0.276802	0.003484	0.785403	0.417665	-0.456913	0.620465	0.652760	-0.359057	1.000000

index	Afghanistan										...	Yemen, P.D. Rep.
variable	AVERAGECLUSTCOEF	CLOSENESS_CENTRALITY	CLUSTCOEF	DEGREE	DEGREE_CENTRALITY	IN_DEGREE	IN_DEGREE_CENTRALITY	KATZ	NUM_EDGES	NUM_NODES	...	CLUSTCOEF	DEGREE	DEGREE_CENTRALITY	IN_DEGREE	IN_DEGREE_CENTRALITY	KATZ	NUM_EDGES	NUM_NODES	PAGERANK	TRIANGLES
PERIOD
1985-02-01	-0.001313	0.000000	-0.027077	0.057692	0.057692	0.041667	0.041667	-0.090895	0.005324	0.0	...	0.037767	0.115385	0.115385	0.147059	0.147059	-0.079270	0.005324	0.0	0.061019	0.214286
1985-03-01	0.009252	0.012500	0.059892	-0.072727	-0.072727	0.040000	0.040000	1.068448	0.005575	0.0	...	-0.006298	-0.051724	-0.051724	-0.076923	-0.076923	0.120605	0.005575	0.0	-0.253603	-0.133795
1985-04-01	0.002814	0.008403	0.021888	0.058824	0.058824	0.076923	0.076923	0.021696	0.009286	0.0	...	0.017743	0.036364	0.036364	0.055556	0.055556	-0.094392	0.009286	0.0	0.052174	0.167776
1985-05-01	-0.015002	-0.028571	-0.044265	-0.092593	-0.092593	-0.214286	-0.214286	0.053878	-0.008514	0.0	...	-0.000867	-0.052632	-0.052632	-0.078947	-0.078947	-0.101800	-0.008514	0.0	0.216340	-0.168757
1985-06-01	0.008185	0.020833	0.001020	0.102041	0.102041	0.227273	0.227273	-0.238598	0.010942	0.0	...	0.016146	0.055556	0.055556	0.057143	0.057143	0.385824	0.010942	0.0	0.090958	0.091907

	index	Afghanistan										...	Yemen, P.D. Rep.
	variable	AVERAGECLUSTCOEF	CLOSENESS_CENTRALITY	CLUSTCOEF	DEGREE	DEGREE_CENTRALITY	IN_DEGREE	IN_DEGREE_CENTRALITY	KATZ	NUM_EDGES	NUM_NODES	...	CLUSTCOEF	DEGREE	DEGREE_CENTRALITY	IN_DEGREE	IN_DEGREE_CENTRALITY	KATZ	NUM_EDGES	NUM_NODES	PAGERANK	TRIANGLES
index	variable
Afghanistan	AVERAGECLUSTCOEF	1.000000	0.007301	0.270514	0.096660	0.112151	-0.035895	-0.014978	-0.020685	0.258529	-0.188801	...	0.084061	0.036299	0.034930	0.024407	0.023363	0.012110	0.307384	0.064097	-0.108507	0.017484
	CLOSENESS_CENTRALITY	0.007301	1.000000	0.070596	0.485592	0.487099	0.872270	0.877717	-0.015846	0.275702	0.015182	...	-0.002341	0.069233	0.070273	0.037095	0.038194	0.031768	0.104245	-0.061700	-0.049887	0.037153
	CLUSTCOEF	0.270514	0.070596	1.000000	-0.335108	-0.322730	0.031990	0.046749	-0.085915	0.027180	-0.143344	...	0.039380	-0.069211	-0.072786	-0.018393	-0.021728	-0.015853	0.062155	0.179089	-0.167428	-0.039580
	DEGREE	0.096660	0.485592	-0.335108	1.000000	0.997265	0.600478	0.602927	0.040919	0.229174	0.036155	...	0.033812	0.054782	0.057311	0.037522	0.040003	-0.035386	0.194808	-0.133062	0.028320	0.035866
	DEGREE_CENTRALITY	0.112151	0.487099	-0.322730	0.997265	1.000000	0.591543	0.599714	0.040684	0.193812	-0.037347	...	0.032426	0.055273	0.057969	0.037527	0.040160	-0.035778	0.196003	-0.141562	0.027863	0.036221
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Yemen, P.D. Rep.	KATZ	0.012110	0.031768	-0.015853	-0.035386	-0.035778	0.011603	0.011356	-0.053152	0.018031	0.004450	...	0.182811	0.135961	0.135791	0.075862	0.075829	1.000000	0.026089	0.002197	-0.000767	-0.026681
	NUM_EDGES	0.307384	0.104245	0.062155	0.194808	0.196003	0.141730	0.143809	0.077874	0.336449	-0.007421	...	0.151252	0.269836	0.269168	0.211113	0.210682	0.026089	1.000000	0.024942	-0.054025	0.226644
	NUM_NODES	0.064097	-0.061700	0.179089	-0.133062	-0.141562	0.006909	-0.001492	-0.005663	0.005411	0.112824	...	0.196678	-0.045196	-0.064841	0.017858	0.000134	0.002197	0.024942	1.000000	0.047756	-0.016231
	PAGERANK	-0.108507	-0.049887	-0.167428	0.028320	0.027863	-0.008172	-0.008795	0.031988	-0.014090	0.007996	...	0.029166	0.221981	0.220902	0.306778	0.305933	-0.000767	-0.054025	0.047756	1.000000	0.325714
	TRIANGLES	0.017484	0.037153	-0.039580	0.035866	0.036221	0.125200	0.126826	0.189249	0.077080	-0.003790	...	0.316320	0.862356	0.861799	0.908394	0.908583	-0.026681	0.226644	-0.016231	0.325714	1.000000

	Argentina-Brazil	Argentina-Chile	Argentina-Japan	Australia-Canada	Australia-China	Australia-France	Australia-Germany	Australia-India	Australia-Italy	Australia-Japan	...	United Kingdom-Singapore	United Kingdom-Spain	United Kingdom-Sweden	United Kingdom-Switzerland	United Kingdom-Taiwan, Province of China	United Kingdom-Tanzania, United Republic of	United Kingdom-Thailand	United Kingdom-Turkey	United Kingdom-United Arab Emirates	United Kingdom-United States
period
1985-02-01	0.062522	-0.154539	-0.293125	0.142470	0.062065	-0.097768	0.065731	-0.131695	0.000170	-0.004596	...	-0.036131	-0.029949	0.006133	0.004249	-0.030834	-0.285420	-0.033476	-0.129838	-0.018257	-0.006231
1985-03-01	-0.157772	-0.028198	0.374053	0.018934	-0.088413	-0.003325	-0.037304	0.080223	-0.137057	-0.023654	...	0.025743	0.069935	0.006004	0.013794	0.069704	0.261847	0.056897	0.113487	0.076519	0.018622
1985-04-01	0.076867	0.130921	0.230518	-0.087285	0.164218	0.056872	-0.091518	0.128781	0.182579	0.051857	...	0.035510	-0.039669	0.050269	0.063971	0.030340	0.175368	0.097109	0.097419	0.006345	0.040990
1985-05-01	-0.079725	-0.085492	-0.072008	0.074769	0.035272	-0.029843	0.165752	0.010152	0.004218	-0.000145	...	0.013411	-0.008849	-0.044910	-0.018795	0.022171	0.196199	-0.045813	-0.046656	-0.076937	0.022249
1985-06-01	0.037981	0.148035	0.147914	0.027766	0.093546	0.115701	-0.122772	0.011665	-0.078161	0.012099	...	-0.056779	0.013070	0.005069	0.012590	-0.035781	-0.138247	-0.038296	0.012301	0.063594	-0.020316

	0	1	2	3
0	95.561462	92.752876	118.098618	116.877548
1	127.792336	115.018021	138.342926	109.401573
2	95.904762	100.220627	116.283112	107.361427
3	160.885559	164.918884	136.924835	143.798813
4	158.978653	162.783722	148.036499	141.698318
...	...	...	...	...
619	96.807587	105.974869	97.704742	98.943565
620	133.173447	132.842651	143.961609	115.667351
621	125.636581	120.315910	112.648201	114.719627
622	115.836693	108.164474	125.263817	114.302780
623	138.017899	137.224213	131.007187	145.280807

1. Load and clean data¶

2. Loop and XGBoost¶

5. PCA on Network Statistics to Reduce Dimensionality¶