Feature Engineering Trade Network Statistics
-
-
In previous work I've calculated some basic network statistics on IMF Direction of Trade Statistics (DOTS) export data.
In this notebook I'll check whether these statistics have any relationship with the percent-change series of bilateral exports. TL;DR: so far, no convincing linear relationships.
Table of Contents:
Collapse network statistics with PCA, repeat 2,3,4 on PCA series
improvements / future work:
validate feature importance
use univariate non-linear models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Long-format DOTS export data -> one column per
# (ReferenceArea, CounterpartReferenceArea) pair, one row per period.
timeSeries = pd.read_csv('dotsTimeSeries.csv').pivot_table(
    index='period',
    columns=['ReferenceArea', 'CounterpartReferenceArea'],
    values='value',
)

# Period-over-period change of the logged values; the first row has no
# predecessor, and any pair with a remaining gap is discarded.
# NOTE(review): this is pct_change of log levels, not a log difference -
# confirm that is the intended transformation.
tsPctChange = np.log(timeSeries).pct_change().iloc[1:]
tsPctChange = tsPctChange.dropna(axis=1)

# Flatten the two-level column labels into "<reference>-<counterpart>".
tsPctChange.columns = ['-'.join(pair) for pair in tsPctChange.columns]

# Blank out moves larger than 1.5 in absolute value, then drop every pair
# that contained at least one such move.
tsPctChange = tsPctChange.mask(tsPctChange.abs() > 1.5)
tsPctChange = tsPctChange.dropna(axis=1)

# Parse the period labels as dates and keep observations after 1985-01-01.
tsPctChange.index = pd.to_datetime(tsPctChange.index)
tsPctChange = tsPctChange.loc[tsPctChange.index > '1985-01-01']
# Network statistics in long form, keyed by ('index', 'PERIOD'); columns that
# are not analysed in this notebook are removed up front.
netStats = pd.read_csv('DOTSnetStats.csv').drop(
    columns=['Unnamed: 0', 'CONNECTIVITY', 'HAS_BRIDGE', 'TOTAL_NET_VALUE', 'PAGERANK_NUMPY']
)
netStats = netStats.set_index(['index', 'PERIOD'])

# Reshape to one row per period and one column per ('index', statistic) pair.
netStatsLong = netStats.reset_index().melt(id_vars=['index', 'PERIOD'])
netStatsWide = netStatsLong.pivot_table(
    index='PERIOD',
    columns=['index', 'variable'],
    values='value',
)
netStatsWide.index = pd.to_datetime(netStatsWide.index)

# Percent change per column; drop the undefined first row and any column with
# gaps, then keep observations after 1985-01-01. The index is already parsed
# as dates above, so it is not converted a second time.
netStatsWidePctChange = netStatsWide.pct_change().iloc[1:]
netStatsWidePctChange = netStatsWidePctChange.dropna(axis=1)
netStatsWidePctChange = netStatsWidePctChange.loc[netStatsWidePctChange.index > '1985-01-01']
# Pairwise correlation between the retained network-statistic columns,
# computed on the long ('index', 'PERIOD') table of levels (not on changes).
netStats.corr()
DEGREE | IN_DEGREE | OUT_DEGREE | DEGREE_CENTRALITY | IN_DEGREE_CENTRALITY | OUT_DEGREE_CENTRALITY | AVG_NEIGHBOR_DEGREE | PAGERANK | KATZ | CLOSENESS_CENTRALITY | BETWEENNESS_CENTRALITY | CLUSTCOEF | NUM_NODES | NUM_EDGES | AVERAGECLUSTCOEF | TRIANGLES | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DEGREE | 1.000000 | 0.905947 | 0.962545 | 0.992468 | 0.907749 | 0.943530 | 0.539215 | 0.342109 | 0.030614 | 0.779620 | 0.579524 | -0.582810 | 0.560734 | 0.578295 | -0.366186 | 0.958205 |
IN_DEGREE | 0.905947 | 1.000000 | 0.757224 | 0.879958 | 0.993021 | 0.722659 | 0.424597 | 0.352711 | 0.044777 | 0.888389 | 0.435932 | -0.439187 | 0.675427 | 0.696580 | -0.441086 | 0.926553 |
OUT_DEGREE | 0.962545 | 0.757224 | 1.000000 | 0.967567 | 0.764474 | 0.992800 | 0.559940 | 0.301901 | 0.018554 | 0.633813 | 0.614865 | -0.617850 | 0.432515 | 0.446061 | -0.282453 | 0.884874 |
DEGREE_CENTRALITY | 0.992468 | 0.879958 | 0.967567 | 1.000000 | 0.894707 | 0.962756 | 0.512342 | 0.380614 | 0.044864 | 0.748006 | 0.642281 | -0.633303 | 0.491743 | 0.508283 | -0.343765 | 0.922991 |
IN_DEGREE_CENTRALITY | 0.907749 | 0.993021 | 0.764474 | 0.894707 | 1.000000 | 0.740622 | 0.395934 | 0.401356 | 0.059710 | 0.873463 | 0.493825 | -0.480728 | 0.611031 | 0.631584 | -0.427157 | 0.905375 |
OUT_DEGREE_CENTRALITY | 0.943530 | 0.722659 | 0.992800 | 0.962756 | 0.740622 | 1.000000 | 0.531070 | 0.329623 | 0.031346 | 0.596529 | 0.667287 | -0.661709 | 0.369877 | 0.382318 | -0.258572 | 0.840450 |
AVG_NEIGHBOR_DEGREE | 0.539215 | 0.424597 | 0.559940 | 0.512342 | 0.395934 | 0.531070 | 1.000000 | -0.110202 | -0.009636 | 0.526910 | 0.076573 | -0.222635 | 0.560895 | 0.577173 | -0.443239 | 0.484496 |
PAGERANK | 0.342109 | 0.352711 | 0.301901 | 0.380614 | 0.401356 | 0.329623 | -0.110202 | 1.000000 | 0.109268 | 0.213201 | 0.603015 | -0.398695 | -0.077618 | -0.073561 | 0.041703 | 0.276802 |
KATZ | 0.030614 | 0.044777 | 0.018554 | 0.044864 | 0.059710 | 0.031346 | -0.009636 | 0.109268 | 1.000000 | 0.024699 | 0.084676 | -0.088603 | -0.013311 | -0.012970 | 0.032814 | 0.003484 |
CLOSENESS_CENTRALITY | 0.779620 | 0.888389 | 0.633813 | 0.748006 | 0.873463 | 0.596529 | 0.526910 | 0.213201 | 0.024699 | 1.000000 | 0.327792 | -0.342888 | 0.804916 | 0.828565 | -0.733591 | 0.785403 |
BETWEENNESS_CENTRALITY | 0.579524 | 0.435932 | 0.614865 | 0.642281 | 0.493825 | 0.667287 | 0.076573 | 0.603015 | 0.084676 | 0.327792 | 1.000000 | -0.577527 | 0.011448 | 0.014592 | -0.088210 | 0.417665 |
CLUSTCOEF | -0.582810 | -0.439187 | -0.617850 | -0.633303 | -0.480728 | -0.661709 | -0.222635 | -0.398695 | -0.088603 | -0.342888 | -0.577527 | 1.000000 | -0.101121 | -0.111978 | 0.185684 | -0.456913 |
NUM_NODES | 0.560734 | 0.675427 | 0.432515 | 0.491743 | 0.611031 | 0.369877 | 0.560895 | -0.077618 | -0.013311 | 0.804916 | 0.011448 | -0.101121 | 1.000000 | 0.969029 | -0.544588 | 0.620465 |
NUM_EDGES | 0.578295 | 0.696580 | 0.446061 | 0.508283 | 0.631584 | 0.382318 | 0.577173 | -0.073561 | -0.012970 | 0.828565 | 0.014592 | -0.111978 | 0.969029 | 1.000000 | -0.603055 | 0.652760 |
AVERAGECLUSTCOEF | -0.366186 | -0.441086 | -0.282453 | -0.343765 | -0.427157 | -0.258572 | -0.443239 | 0.041703 | 0.032814 | -0.733591 | -0.088210 | 0.185684 | -0.544588 | -0.603055 | 1.000000 | -0.359057 |
TRIANGLES | 0.958205 | 0.926553 | 0.884874 | 0.922991 | 0.905375 | 0.840450 | 0.484496 | 0.276802 | 0.003484 | 0.785403 | 0.417665 | -0.456913 | 0.620465 | 0.652760 | -0.359057 | 1.000000 |
# Preview the first rows of the percent-change table of network statistics.
netStatsWidePctChange.head()
index | Afghanistan | ... | Yemen, P.D. Rep. | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | AVERAGECLUSTCOEF | CLOSENESS_CENTRALITY | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | ... | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | PAGERANK | TRIANGLES |
PERIOD | |||||||||||||||||||||
1985-02-01 | 0.006149 | 0.008230 | -0.017379 | 0.019608 | 0.019608 | 0.043478 | 0.043478 | -0.304667 | 0.002810 | 0.0 | ... | -0.062441 | -0.018868 | -0.018868 | -0.028571 | -0.028571 | 0.017828 | 0.002810 | 0.0 | -0.066568 | 0.024390 |
1985-03-01 | -0.001313 | 0.000000 | -0.027077 | 0.057692 | 0.057692 | 0.041667 | 0.041667 | -0.090895 | 0.005324 | 0.0 | ... | 0.037767 | 0.115385 | 0.115385 | 0.147059 | 0.147059 | -0.079270 | 0.005324 | 0.0 | 0.061019 | 0.214286 |
1985-04-01 | 0.009252 | 0.012500 | 0.059892 | -0.072727 | -0.072727 | 0.040000 | 0.040000 | 1.068448 | 0.005575 | 0.0 | ... | -0.006298 | -0.051724 | -0.051724 | -0.076923 | -0.076923 | 0.120605 | 0.005575 | 0.0 | -0.253603 | -0.133795 |
1985-05-01 | 0.002814 | 0.008403 | 0.021888 | 0.058824 | 0.058824 | 0.076923 | 0.076923 | 0.021696 | 0.009286 | 0.0 | ... | 0.017743 | 0.036364 | 0.036364 | 0.055556 | 0.055556 | -0.094392 | 0.009286 | 0.0 | 0.052174 | 0.167776 |
1985-06-01 | -0.015002 | -0.028571 | -0.044265 | -0.092593 | -0.092593 | -0.214286 | -0.214286 | 0.053878 | -0.008514 | 0.0 | ... | -0.000867 | -0.052632 | -0.052632 | -0.078947 | -0.078947 | -0.101800 | -0.008514 | 0.0 | 0.216340 | -0.168757 |
5 rows × 1618 columns
# Correlation between every pair of ('index', statistic) percent-change columns.
netStatsWidePctChange.corr()
index | Afghanistan | ... | Yemen, P.D. Rep. | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
variable | AVERAGECLUSTCOEF | CLOSENESS_CENTRALITY | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | ... | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | PAGERANK | TRIANGLES | |
index | variable | |||||||||||||||||||||
Afghanistan | AVERAGECLUSTCOEF | 1.000000 | 0.008563 | 0.268401 | 0.097012 | 0.112500 | -0.034662 | -0.013745 | -0.020592 | 0.258409 | -0.188789 | ... | 0.069385 | 0.034931 | 0.033566 | 0.022539 | 0.021497 | 0.012239 | 0.308246 | 0.063992 | -0.113081 | 0.018248 |
CLOSENESS_CENTRALITY | 0.008563 | 1.000000 | 0.069392 | 0.485661 | 0.487173 | 0.872371 | 0.877817 | -0.015789 | 0.275666 | 0.015062 | ... | -0.010067 | 0.068281 | 0.069322 | 0.035814 | 0.036912 | 0.031849 | 0.105024 | -0.061704 | -0.053218 | 0.037660 | |
CLUSTCOEF | 0.268401 | 0.069392 | 1.000000 | -0.335240 | -0.322883 | 0.030909 | 0.045627 | -0.085917 | 0.027050 | -0.143091 | ... | 0.048122 | -0.067967 | -0.071540 | -0.016755 | -0.020086 | -0.015965 | 0.060949 | 0.178997 | -0.161005 | -0.040235 | |
DEGREE | 0.097012 | 0.485661 | -0.335240 | 1.000000 | 0.997265 | 0.600504 | 0.602948 | 0.040934 | 0.229190 | 0.036113 | ... | 0.029834 | 0.054439 | 0.056968 | 0.037052 | 0.039531 | -0.035350 | 0.195018 | -0.133065 | 0.026756 | 0.036045 | |
DEGREE_CENTRALITY | 0.112500 | 0.487173 | -0.322883 | 0.997265 | 1.000000 | 0.591579 | 0.599743 | 0.040700 | 0.193831 | -0.037386 | ... | 0.028419 | 0.054921 | 0.057617 | 0.037045 | 0.039675 | -0.035741 | 0.196222 | -0.141565 | 0.026265 | 0.036405 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Yemen, P.D. Rep. | KATZ | 0.012239 | 0.031849 | -0.015965 | -0.035350 | -0.035741 | 0.011687 | 0.011442 | -0.053146 | 0.018041 | 0.004437 | ... | 0.175340 | 0.135799 | 0.135630 | 0.075658 | 0.075625 | 1.000000 | 0.026173 | 0.002194 | -0.001177 | -0.026618 |
NUM_EDGES | 0.308246 | 0.105024 | 0.060949 | 0.195018 | 0.196222 | 0.142425 | 0.144518 | 0.077890 | 0.336385 | -0.007532 | ... | 0.137823 | 0.268694 | 0.268028 | 0.209588 | 0.209158 | 0.026173 | 1.000000 | 0.024900 | -0.057362 | 0.227043 | |
NUM_NODES | 0.063992 | -0.061704 | 0.178997 | -0.133065 | -0.141565 | 0.006879 | -0.001520 | -0.005665 | 0.005408 | 0.112827 | ... | 0.189844 | -0.045144 | -0.064780 | 0.017885 | 0.000177 | 0.002194 | 0.024900 | 1.000000 | 0.047493 | -0.016247 | |
PAGERANK | -0.113081 | -0.053218 | -0.161005 | 0.026756 | 0.026265 | -0.011613 | -0.012307 | 0.031506 | -0.014342 | 0.008425 | ... | 0.062099 | 0.223940 | 0.222865 | 0.309366 | 0.308527 | -0.001177 | -0.057362 | 0.047493 | 1.000000 | 0.320644 | |
TRIANGLES | 0.018248 | 0.037660 | -0.040235 | 0.036045 | 0.036405 | 0.125626 | 0.127260 | 0.189249 | 0.077119 | -0.003858 | ... | 0.300027 | 0.861270 | 0.860715 | 0.906685 | 0.906874 | -0.026618 | 0.227043 | -0.016247 | 0.320644 | 1.000000 |
1618 rows × 1618 columns
# Preview the first rows of the cleaned bilateral export change series.
tsPctChange.head()
Argentina-Brazil | Argentina-Chile | Argentina-Japan | Australia-Canada | Australia-China | Australia-France | Australia-Germany | Australia-India | Australia-Italy | Australia-Japan | ... | United Kingdom-Singapore | United Kingdom-Spain | United Kingdom-Sweden | United Kingdom-Switzerland | United Kingdom-Taiwan, Province of China | United Kingdom-Tanzania, United Republic of | United Kingdom-Thailand | United Kingdom-Turkey | United Kingdom-United Arab Emirates | United Kingdom-United States | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
period | |||||||||||||||||||||
1985-02-01 | 0.062522 | -0.154539 | -0.293125 | 0.142470 | 0.062065 | -0.097768 | 0.065731 | -0.131695 | 0.000170 | -0.004596 | ... | -0.036131 | -0.029949 | 0.006133 | 0.004249 | -0.030834 | -0.285420 | -0.033476 | -0.129838 | -0.018257 | -0.006231 |
1985-03-01 | -0.157772 | -0.028198 | 0.374053 | 0.018934 | -0.088413 | -0.003325 | -0.037304 | 0.080223 | -0.137057 | -0.023654 | ... | 0.025743 | 0.069935 | 0.006004 | 0.013794 | 0.069704 | 0.261847 | 0.056897 | 0.113487 | 0.076519 | 0.018622 |
1985-04-01 | 0.076867 | 0.130921 | 0.230518 | -0.087285 | 0.164218 | 0.056872 | -0.091518 | 0.128781 | 0.182579 | 0.051857 | ... | 0.035510 | -0.039669 | 0.050269 | 0.063971 | 0.030340 | 0.175368 | 0.097109 | 0.097419 | 0.006345 | 0.040990 |
1985-05-01 | -0.079725 | -0.085492 | -0.072008 | 0.074769 | 0.035272 | -0.029843 | 0.165752 | 0.010152 | 0.004218 | -0.000145 | ... | 0.013411 | -0.008849 | -0.044910 | -0.018795 | 0.022171 | 0.196199 | -0.045813 | -0.046656 | -0.076937 | 0.022249 |
1985-06-01 | 0.037981 | 0.148035 | 0.147914 | 0.027766 | 0.093546 | 0.115701 | -0.122772 | 0.011665 | -0.078161 | 0.012099 | ... | -0.056779 | 0.013070 | 0.005069 | 0.012590 | -0.035781 | -0.138247 | -0.038296 | 0.012301 | 0.063594 | -0.020316 |
5 rows × 640 columns
# Unique economy names found on each side of the "<first>-<second>" column labels.
# NOTE(review): other cells treat the first element as the exporter, so these
# two names look swapped; they are left as-is because only their union is used.
importers = pd.Series(col.split('-')[0] for col in tsPctChange.columns).unique()
exporters = pd.Series(col.split('-')[1] for col in tsPctChange.columns).unique()
allEcons = sorted(set(importers) | set(exporters))

# Number of distinct statistic names. Stored under its own name so that
# `netStats` keeps referring to the network-statistics DataFrame instead of
# being overwritten by an integer.
numNetStats = pd.Series(col[1] for col in netStatsWidePctChange.columns).nunique()
print('The upper-bound on number of tests:', len(allEcons) * numNetStats)
The upper-bound on number of tests: 1216
# Univariate OLS screen: for every bilateral export series, regress it on each
# network statistic of the first-named economy (the exporter, per the labels'
# "<exporter>-<importer>" convention used in this notebook), one statistic at a
# time. The first 90% of rows are used for fitting and the last 10% (no
# shuffling) for the out-of-sample MSE.
econs = pd.Series(col for col in tsPctChange.columns).unique()
regRecords = []
for tempSeries in econs:
    exporter = tempSeries.split('-')[0]

    # Network statistics available for the exporter; explicit copy so that
    # relabelling the columns never touches netStatsWidePctChange.
    statCols = [col for col in netStatsWidePctChange.columns if col[0] == exporter]
    X_econ = netStatsWidePctChange[statCols].copy()
    allNs = [col[1] for col in statCols]
    X_econ.columns = allNs

    # Dependent variable: the bilateral export change series itself. The
    # column label is left untouched (joining its characters with '_' only
    # garbled the name and was never used downstream).
    y = tsPctChange[[tempSeries]]

    for tempNs in allNs:
        X = sm.add_constant(X_econ[tempNs], has_constant='add')
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
        reg = sm.OLS(y_train, X_train).fit()
        y_pred = reg.predict(X_test)
        # Position 0 is the constant, position 1 the statistic; .iloc makes the
        # positional lookup explicit on these label-indexed Series.
        regRecords.append({
            'index': tempSeries,
            'ns': reg.params.index[1],
            'coef': reg.params.iloc[1],
            'pvalue': reg.pvalues.iloc[1],
            'r2': reg.rsquared,
            'aic': reg.aic,
            'mse': mean_squared_error(y_test, y_pred),
        })

# One row per (series, statistic) fit, with a fresh integer index.
regResults = pd.DataFrame(
    regRecords,
    columns=['index', 'ns', 'coef', 'pvalue', 'r2', 'aic', 'mse'],
)
# Fits with the five smallest p-values (nsmallest defaults to n=5), shown in table order.
regResults[regResults.index.isin(regResults['pvalue'].nsmallest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
1858 | France-Italy | PAGERANK | 0.366934 | 3.058292e-72 | 0.560019 | -1923.270376 | 0.000415 |
4186 | Italy-Austria | PAGERANK | 0.294486 | 1.078482e-65 | 0.525103 | -1723.383263 | 0.000481 |
4330 | Italy-France | PAGERANK | 0.281747 | 1.283553e-79 | 0.596303 | -1873.084271 | 0.000485 |
4346 | Italy-Germany | PAGERANK | 0.219648 | 5.352173e-74 | 0.568945 | -2025.713641 | 0.000250 |
4762 | Italy-Spain | PAGERANK | 0.332262 | 3.005676e-61 | 0.499840 | -1587.743934 | 0.000609 |
# Fits with the five largest in-sample R-squared values, shown in table order.
regResults[regResults.index.isin(regResults['r2'].nlargest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
1858 | France-Italy | PAGERANK | 0.366934 | 3.058292e-72 | 0.560019 | -1923.270376 | 0.000415 |
4186 | Italy-Austria | PAGERANK | 0.294486 | 1.078482e-65 | 0.525103 | -1723.383263 | 0.000481 |
4330 | Italy-France | PAGERANK | 0.281747 | 1.283553e-79 | 0.596303 | -1873.084271 | 0.000485 |
4346 | Italy-Germany | PAGERANK | 0.219648 | 5.352173e-74 | 0.568945 | -2025.713641 | 0.000250 |
4762 | Italy-Spain | PAGERANK | 0.332262 | 3.005676e-61 | 0.499840 | -1587.743934 | 0.000609 |
# Fits with the five smallest AIC values, shown in table order.
regResults[regResults.index.isin(regResults['aic'].nsmallest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
885 | Canada-United States | DEGREE | 0.098988 | 1.032284e-08 | 0.079926 | -2663.455321 | 0.000103 |
886 | Canada-United States | DEGREE_CENTRALITY | 0.091503 | 6.636601e-08 | 0.071443 | -2659.820961 | 0.000104 |
890 | Canada-United States | NUM_EDGES | 0.254925 | 9.073834e-10 | 0.090931 | -2668.220270 | 0.000076 |
892 | Canada-United States | OUT_DEGREE | 0.067155 | 7.640546e-07 | 0.060235 | -2655.069445 | 0.000106 |
895 | Canada-United States | TRIANGLES | 0.091143 | 5.187018e-09 | 0.083051 | -2664.802277 | 0.000095 |
# Fits with the five smallest hold-out MSE values, shown in table order.
regResults[regResults.index.isin(regResults['mse'].nsmallest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
6088 | Netherlands-Germany | AVERAGECLUSTCOEF | 0.328054 | 6.201868e-05 | 0.039943 | -2443.170822 | 0.000059 |
6089 | Netherlands-Germany | AVG_NEIGHBOR_DEGREE | 0.163929 | 7.750331e-05 | 0.038913 | -2442.746399 | 0.000059 |
6093 | Netherlands-Germany | DEGREE | 0.096945 | 9.414008e-03 | 0.016989 | -2433.814378 | 0.000067 |
6098 | Netherlands-Germany | NUM_EDGES | 0.365567 | 8.341191e-12 | 0.111869 | -2474.008866 | 0.000045 |
6103 | Netherlands-Germany | TRIANGLES | 0.253912 | 5.776218e-09 | 0.082563 | -2461.152722 | 0.000053 |
# Keep fits with p-value below 0.05 and R-squared above 0.5, renumbered from 0
# so the rows can be addressed by position in the plotting cell.
filteredRegResults = regResults.query('pvalue<0.05 and r2>.5').reset_index(drop=True)
filteredRegResults
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
0 | France-Italy | PAGERANK | 0.366934 | 3.058292e-72 | 0.560019 | -1923.270376 | 0.000415 |
1 | Italy-Austria | PAGERANK | 0.294486 | 1.078482e-65 | 0.525103 | -1723.383263 | 0.000481 |
2 | Italy-France | PAGERANK | 0.281747 | 1.283553e-79 | 0.596303 | -1873.084271 | 0.000485 |
3 | Italy-Germany | PAGERANK | 0.219648 | 5.352173e-74 | 0.568945 | -2025.713641 | 0.000250 |
Let's do a visual check of the series that came back with any remote form of a linear relationship.
We can see that many of the relationships are affected by outliers so these numbers are misleading.
from math import ceil

# Scatter grid: one panel per retained regression, four panels per row.
ncols = 4
nResults = filteredRegResults.shape[0]
# Always request at least one row so an empty filter yields an empty figure
# instead of an error from plt.subplots.
nrows = max(1, ceil(nResults / ncols))
width = ncols * 5
length = nrows * 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width, length))
for i, ax in enumerate(axes.flatten()):
    # The grid is rounded up to full rows, so trailing panels have no matching
    # result row; leave them blank rather than indexing past the table.
    if i >= nResults:
        continue
    row = filteredRegResults.iloc[i]
    exporter = row['index'].split('-')[0]
    ax.scatter(
        x=tsPctChange[row['index']],
        y=netStatsWidePctChange[(exporter, row['ns'])],
    )
    ax.set_title(f"pvalue:{np.round(row['pvalue'], 4)}, r2:{np.round(row['r2'], 2)}, aic:{np.round(row['aic'], 2)}")
    ax.set_ylabel(f"{row['ns']} Percent Change")
    ax.set_xlabel(f"{row['index']} Percent Change")
plt.tight_layout()
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.decomposition import PCA
from numpy.linalg import eig
# %%
# Every economy named on either side of a bilateral series label.
importers = pd.Series([label.split('-')[0] for label in tsPctChange.columns]).unique()
exporters = pd.Series([label.split('-')[1] for label in tsPctChange.columns]).unique()
allEcons = sorted(set(importers).union(exporters))

# Five panels per row, with enough rows to give each economy its own panel.
ncols = 5
nrows = ceil(len(allEcons) / ncols)
width = ncols * 5
length = nrows * 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(width, length), dpi=120)
# for i, ax in enumerate(axes.flatten()):
def myplot(score, coeff, i, ax, tempSeries, labels=None):
    """Draw a PCA biplot on ``ax``.

    Parameters
    ----------
    score : 2-D array
        Projected observations; columns 0 and 1 are plotted.
    coeff : 2-D array
        One row per original feature, holding its weights on the two plotted
        components; each row is drawn as an arrow from the origin.
    i : int
        Unused; accepted so existing callers keep working.
    ax : matplotlib Axes
        Axes to draw on.
    tempSeries : str
        Panel title.
    labels : sequence of str, optional
        Arrow labels, one per row of ``coeff``. Defaults to "Var1", "Var2", ...
    """
    xs = score[:, 0]
    ys = score[:, 1]

    # Divide each axis by its range so the scores span one unit. A constant
    # axis has zero range; leave it unscaled instead of dividing by zero.
    xRange = xs.max() - xs.min()
    yRange = ys.max() - ys.min()
    scalex = 1.0 / xRange if xRange else 1.0
    scaley = 1.0 / yRange if yRange else 1.0
    ax.scatter(xs * scalex, ys * scaley)

    # A separate loop name keeps the (unused) parameter `i` from being shadowed.
    for j in range(coeff.shape[0]):
        ax.arrow(0, 0, coeff[j, 0], coeff[j, 1], color='r', alpha=0.5)
        label = "Var" + str(j + 1) if labels is None else labels[j]
        # Place the label slightly beyond the arrow tip.
        ax.text(coeff[j, 0] * 1.15, coeff[j, 1] * 1.15, label, color='g', ha='center', va='center')
    ax.set_title(tempSeries)
# One biplot per economy: standardise its network-statistic changes, run a
# full PCA, and draw the first two components on the economy's panel.
for i, (tempSeries, ax) in enumerate(zip(allEcons, axes.flatten())):
    econCols = [col for col in netStatsWidePctChange.columns if col[0] == tempSeries]
    temp = netStatsWidePctChange[econCols]
    # Economies without any network statistics keep an empty panel.
    if temp.shape[1] == 0:
        continue
    # Put every statistic on the same scale before PCA.
    scaler = StandardScaler()
    X = scaler.fit_transform(temp)
    pca = PCA()
    x_new = pca.fit_transform(X)
    # Scores and loadings of the first two components only; arrows are
    # labelled with the statistic names.
    # See https://stackoverflow.com/questions/50796024/feature-variable-importance-after-a-pca-analysis
    firstTwoScores = x_new[:, 0:2]
    firstTwoLoadings = pca.components_[0:2, :].T
    statNames = [col[1] for col in econCols]
    myplot(firstTwoScores, firstTwoLoadings, i, ax, tempSeries, statNames)
plt.tight_layout()
Example of the relationship between the original features and the principal components. Because the features are standardised first, each loading can be read (approximately) as the correlation between an original feature and a component.
# Worked example for a single economy: loadings of each network statistic on
# the first three principal components.
econ = 'Argentina'
econCols = [col for col in netStatsWidePctChange.columns if col[0] == econ]
temp = netStatsWidePctChange[econCols]

# Standardise, then fit a three-component PCA and project the data.
scaler = StandardScaler()
scaledData = pd.DataFrame(scaler.fit_transform(temp))
pcaModel = PCA(n_components=3)
pcaModelFit = pcaModel.fit(scaledData)
principalComponents = pcaModelFit.transform(scaledData)

# Loadings = component weights scaled by the square root of each component's
# explained variance, one row per original statistic.
componentScale = np.sqrt(pcaModelFit.explained_variance_)
loadings = pcaModelFit.components_.T * componentScale
loading_matrix = pd.DataFrame(loadings, index=temp.columns)

# Share of total variance captured by the three components.
print(pcaModelFit.explained_variance_ratio_.sum())
loading_matrix.sort_values(by=[0], ascending=False)
0.7227717252387083
0 | 1 | 2 | ||
---|---|---|---|---|
index | variable | |||
Argentina | DEGREE_CENTRALITY | 0.954604 | 0.128942 | 0.040180 |
DEGREE | 0.948894 | 0.176514 | 0.059231 | |
CLOSENESS_CENTRALITY | 0.904727 | -0.136198 | -0.089371 | |
IN_DEGREE_CENTRALITY | 0.903900 | -0.108930 | -0.080641 | |
IN_DEGREE | 0.899232 | -0.047406 | -0.056052 | |
TRIANGLES | 0.889398 | 0.252981 | 0.033791 | |
PAGERANK | 0.059270 | -0.464178 | -0.388364 | |
AVERAGECLUSTCOEF | 0.047154 | 0.108996 | -0.881155 | |
KATZ | 0.012647 | -0.053412 | -0.066499 | |
NUM_EDGES | -0.006441 | 0.832203 | -0.374385 | |
NUM_NODES | -0.224252 | 0.776487 | 0.244186 | |
CLUSTCOEF | -0.840177 | 0.058003 | -0.234825 |
an attempt to interpret the principal components:
PC 0: Centrality/Degree measures -> "Connectivity"
PC 1: Macro features such as number of edges and nodes while negatively related to pagerank values
PC 2: A bit of everything; it overlaps PageRank and the number of edges/nodes, which are clearly separated in PC 1
# Same univariate OLS screen as before, but the regressors are the first three
# principal components of the exporter's standardised network statistics.
# The first 90% of rows are used for fitting and the last 10% (no shuffling)
# for the out-of-sample MSE.
econs = pd.Series(col for col in tsPctChange.columns).unique()
n_components = 3
# The components depend only on the exporter, so they are computed once per
# exporter and reused for all of its bilateral series. None marks an exporter
# with no network statistics.
pcaByExporter = {}
regRecordsPCA = []
for tempSeries in econs:
    exporter = tempSeries.split('-')[0]
    if exporter not in pcaByExporter:
        statCols = [col for col in netStatsWidePctChange.columns if col[0] == exporter]
        if statCols:
            exporterStats = netStatsWidePctChange[statCols]
            scaler = StandardScaler()
            scaledData = scaler.fit_transform(exporterStats)
            pcaModel = PCA(n_components=n_components)
            pcaModelFit = pcaModel.fit(scaledData)
            # Keep the statistics' own period index on the scores rather than
            # overwriting it with the trade series' index afterwards.
            pcaByExporter[exporter] = pd.DataFrame(
                pcaModelFit.transform(scaledData),
                columns=[str(col) for col in range(n_components)],
                index=exporterStats.index,
            )
        else:
            pcaByExporter[exporter] = None

    X_econ = pcaByExporter[exporter]
    if X_econ is None:
        continue

    # Dependent variable: the bilateral export change series itself. The
    # column label is left untouched (joining its characters with '_' only
    # garbled the name and was never used downstream).
    y = tsPctChange[[tempSeries]]

    for tempNs in X_econ.columns:
        X = sm.add_constant(X_econ[tempNs], has_constant='add')
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
        reg = sm.OLS(y_train, X_train).fit()
        y_pred = reg.predict(X_test)
        # Position 0 is the constant, position 1 the component; .iloc makes the
        # positional lookup explicit on these label-indexed Series.
        regRecordsPCA.append({
            'index': tempSeries,
            'ns': reg.params.index[1],
            'coef': reg.params.iloc[1],
            'pvalue': reg.pvalues.iloc[1],
            'r2': reg.rsquared,
            'aic': reg.aic,
            'mse': mean_squared_error(y_test, y_pred),
        })

# One row per (series, component) fit, with a fresh integer index.
regResultsPCA = pd.DataFrame(
    regRecordsPCA,
    columns=['index', 'ns', 'coef', 'pvalue', 'r2', 'aic', 'mse'],
)
# PCA fits with the five smallest p-values (nsmallest defaults to n=5), shown in table order.
regResultsPCA[regResultsPCA.index.isin(regResultsPCA['pvalue'].nsmallest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
867 | Italy-Austria | 0 | 0.006742 | 1.853681e-23 | 0.223638 | -1528.740775 | 0.000760 |
894 | Italy-France | 0 | 0.005892 | 3.648805e-22 | 0.211911 | -1608.181766 | 0.000866 |
897 | Italy-Germany | 0 | 0.004781 | 5.983764e-23 | 0.219047 | -1790.378996 | 0.000484 |
948 | Italy-Netherlands | 0 | 0.006450 | 4.852923e-22 | 0.210780 | -1533.830995 | 0.000644 |
1002 | Italy-United Kingdom | 0 | 0.005826 | 3.209635e-23 | 0.221491 | -1639.549949 | 0.000803 |
# PCA fits with the five largest in-sample R-squared values, shown in table order.
regResultsPCA[regResultsPCA.index.isin(regResultsPCA['r2'].nlargest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
867 | Italy-Austria | 0 | 0.006742 | 1.853681e-23 | 0.223638 | -1528.740775 | 0.000760 |
894 | Italy-France | 0 | 0.005892 | 3.648805e-22 | 0.211911 | -1608.181766 | 0.000866 |
897 | Italy-Germany | 0 | 0.004781 | 5.983764e-23 | 0.219047 | -1790.378996 | 0.000484 |
948 | Italy-Netherlands | 0 | 0.006450 | 4.852923e-22 | 0.210780 | -1533.830995 | 0.000644 |
1002 | Italy-United Kingdom | 0 | 0.005826 | 3.209635e-23 | 0.221491 | -1639.549949 | 0.000803 |
# PCA fits with the five smallest AIC values, shown in table order.
regResultsPCA[regResultsPCA.index.isin(regResultsPCA['aic'].nsmallest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
177 | Canada-United States | 0 | 0.000743 | 5.443885e-07 | 0.061794 | -2655.726957 | 0.000107 |
178 | Canada-United States | 1 | 0.000766 | 6.290972e-03 | 0.018792 | -2637.980310 | 0.000104 |
179 | Canada-United States | 2 | 0.001416 | 1.241709e-05 | 0.047375 | -2649.687178 | 0.000095 |
564 | Germany-Netherlands | 0 | 0.000564 | 6.635361e-03 | 0.018553 | -2453.067065 | 0.000099 |
565 | Germany-Netherlands | 1 | 0.001542 | 4.810872e-06 | 0.051754 | -2466.695017 | 0.000077 |
# PCA fits with the five smallest hold-out MSE values, shown in table order.
regResultsPCA[regResultsPCA.index.isin(regResultsPCA['mse'].nsmallest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
179 | Canada-United States | 2 | 0.001416 | 0.000012 | 0.047375 | -2649.687178 | 0.000095 |
565 | Germany-Netherlands | 1 | 0.001542 | 0.000005 | 0.051754 | -2466.695017 | 0.000077 |
1242 | Netherlands-Germany | 0 | 0.000361 | 0.104435 | 0.006678 | -2429.682232 | 0.000069 |
1243 | Netherlands-Germany | 1 | 0.001509 | 0.000002 | 0.055816 | -2449.773089 | 0.000059 |
1244 | Netherlands-Germany | 2 | 0.001605 | 0.000004 | 0.052761 | -2448.493640 | 0.000055 |
# PCA fits with the five largest coefficients in absolute value, shown in table order.
regResultsPCA[regResultsPCA.index.isin(abs(regResultsPCA['coef']).nlargest().index)]
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
119 | Canada-Denmark | 2 | 0.026723 | 3.769401e-03 | 0.021100 | 7.875695 | 0.037339 |
640 | Greece-Egypt | 1 | -0.021450 | 4.405205e-02 | 0.010251 | 110.083040 | 0.015408 |
1433 | Portugal-Japan | 2 | -0.021662 | 1.224228e-02 | 0.015821 | -183.288771 | 0.016500 |
1566 | Sri Lanka-Japan | 0 | 0.024138 | 3.983916e-16 | 0.154936 | -634.561938 | 0.012071 |
1688 | Switzerland-New Zealand | 2 | 0.022450 | 7.622431e-07 | 0.060246 | -515.664428 | 0.056722 |
# Keep PCA fits with p-value below 0.1 and R-squared above 0.2, renumbered
# from 0 so the rows can be addressed by position in the plotting cell.
filteredregResultsPCA = regResultsPCA.query('pvalue<0.1 and r2>0.2').reset_index(drop=True)
filteredregResultsPCA
index | ns | coef | pvalue | r2 | aic | mse | |
---|---|---|---|---|---|---|---|
0 | Italy-Austria | 0 | 0.006742 | 1.853681e-23 | 0.223638 | -1528.740775 | 0.000760 |
1 | Italy-Denmark | 0 | 0.007898 | 1.670098e-21 | 0.205860 | -1361.671432 | 0.001131 |
2 | Italy-France | 0 | 0.005892 | 3.648805e-22 | 0.211911 | -1608.181766 | 0.000866 |
3 | Italy-Germany | 0 | 0.004781 | 5.983764e-23 | 0.219047 | -1790.378996 | 0.000484 |
4 | Italy-Netherlands | 0 | 0.006450 | 4.852923e-22 | 0.210780 | -1533.830995 | 0.000644 |
5 | Italy-United Kingdom | 0 | 0.005826 | 3.209635e-23 | 0.221491 | -1639.549949 | 0.000803 |
from math import ceil

# Scatter grid: one panel per retained PCA regression, four panels per row.
ncols = 4
nResults = filteredregResultsPCA.shape[0]
# Always request at least one row so an empty filter yields an empty figure
# instead of an error from plt.subplots.
nrows = max(1, ceil(nResults / ncols))
width = ncols * 5
length = nrows * 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width, length))
for i, ax in enumerate(axes.flatten()):
    # The grid is rounded up to full rows; trailing panels stay blank.
    if i >= nResults:
        continue
    row = filteredregResultsPCA.iloc[i]
    econ = row['index'].split('-')[0]
    temp = netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == econ]]
    if temp.shape[1] == 0:
        continue

    # Rebuild the exporter's principal components exactly as in the
    # regression step (standardise, then three-component PCA).
    scaler = StandardScaler()
    scaledData = pd.DataFrame(scaler.fit_transform(temp))
    pcaModel = PCA(n_components=3)
    pcaModelFit = pcaModel.fit(scaledData)
    principalComponents = pcaModelFit.transform(scaledData)

    # 'ns' stores the component number as a string ("0", "1", "2").
    ax.scatter(
        x=tsPctChange[row['index']],
        y=principalComponents[:, int(row['ns'])],
    )
    ax.set_title(f"pvalue:{np.round(row['pvalue'], 5)}, r2:{np.round(row['r2'], 2)}, aic:{np.round(row['aic'], 2)}")
    # The y values are principal-component scores, not percent changes, so
    # the axis is labelled accordingly.
    ax.set_ylabel(f"PC {row['ns']} score")
    ax.set_xlabel(f"{row['index']}")
plt.tight_layout()