Feature Engineering Trade Network Statistics
-
-
In previous work I've calculated some basic network statistics on IMF Direction of Trade Statistics (DOTS) export data.
In this notebook I'll check whether these network statistics have any relationship with the percent change in bilateral export series. TL;DR: so far I have found no linear relationships.
Table of Contents:
Collapse network statistics with PCA, repeat 2,3,4 on PCA series
improvements / future work:
validate feature importance
use univariate non-linear models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Reshape the long DOTS extract into a wide frame: one row per period and one
# column per (ReferenceArea, CounterpartReferenceArea) pair.
timeSeries = pd.read_csv('dotsTimeSeries.csv').pivot_table(
    values='value',
    index='period',
    columns=['ReferenceArea', 'CounterpartReferenceArea'],
)

# Period-over-period percent change of the logged values. The first row has no
# predecessor, and any pair with a missing value is dropped entirely.
# NOTE(review): this is pct_change applied to log levels, not a log difference
# -- confirm that this is the intended transformation.
tsPctChange = np.log(timeSeries).pct_change().iloc[1:].dropna(axis=1)

# Flatten the two-level column labels into 'Reference-Counterpart' strings.
tsPctChange.columns = ['-'.join(pair) for pair in tsPctChange.columns]

# Blank out changes beyond +/-1.5, then discard every pair that had one.
tsPctChange = tsPctChange.mask((tsPctChange > 1.5) | (tsPctChange < -1.5)).dropna(axis=1)

# Switch to a datetime index and keep rows strictly after 1985-01-01.
tsPctChange.index = pd.to_datetime(tsPctChange.index)
tsPctChange = tsPctChange.loc[tsPctChange.index > '1985-01-01']
# Per-economy, per-period network statistics. The CSV's leftover index column
# and the statistics not used in this analysis are dropped on load.
netStats = pd.read_csv('DOTSnetStats.csv').drop(
    columns=['Unnamed: 0', 'CONNECTIVITY', 'HAS_BRIDGE', 'TOTAL_NET_VALUE', 'PAGERANK_NUMPY']
)
netStats = netStats.set_index(['index', 'PERIOD'])

# Wide layout: one row per period, one column per (economy, statistic) pair.
netStatsWide = (
    netStats
    .reset_index()
    .melt(id_vars=['index', 'PERIOD'])
    .pivot_table(values='value', index='PERIOD', columns=['index', 'variable'])
)
netStatsWide.index = pd.to_datetime(netStatsWide.index)

# Percent change per column; the first row has no predecessor and any column
# with a missing value is dropped. Rows strictly after 1985-01-01 are kept.
netStatsWidePctChange = netStatsWide.pct_change().iloc[1:].dropna(axis=1)
netStatsWidePctChange.index = pd.to_datetime(netStatsWidePctChange.index)
netStatsWidePctChange = netStatsWidePctChange.loc[netStatsWidePctChange.index > '1985-01-01']

# Pairwise correlation of the statistics in levels, pooled over all rows.
netStats.corr()
| DEGREE | IN_DEGREE | OUT_DEGREE | DEGREE_CENTRALITY | IN_DEGREE_CENTRALITY | OUT_DEGREE_CENTRALITY | AVG_NEIGHBOR_DEGREE | PAGERANK | KATZ | CLOSENESS_CENTRALITY | BETWEENNESS_CENTRALITY | CLUSTCOEF | NUM_NODES | NUM_EDGES | AVERAGECLUSTCOEF | TRIANGLES | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| DEGREE | 1.000000 | 0.905947 | 0.962545 | 0.992468 | 0.907749 | 0.943530 | 0.539215 | 0.342109 | 0.030614 | 0.779620 | 0.579524 | -0.582810 | 0.560734 | 0.578295 | -0.366186 | 0.958205 | 
| IN_DEGREE | 0.905947 | 1.000000 | 0.757224 | 0.879958 | 0.993021 | 0.722659 | 0.424597 | 0.352711 | 0.044777 | 0.888389 | 0.435932 | -0.439187 | 0.675427 | 0.696580 | -0.441086 | 0.926553 | 
| OUT_DEGREE | 0.962545 | 0.757224 | 1.000000 | 0.967567 | 0.764474 | 0.992800 | 0.559940 | 0.301901 | 0.018554 | 0.633813 | 0.614865 | -0.617850 | 0.432515 | 0.446061 | -0.282453 | 0.884874 | 
| DEGREE_CENTRALITY | 0.992468 | 0.879958 | 0.967567 | 1.000000 | 0.894707 | 0.962756 | 0.512342 | 0.380614 | 0.044864 | 0.748006 | 0.642281 | -0.633303 | 0.491743 | 0.508283 | -0.343765 | 0.922991 | 
| IN_DEGREE_CENTRALITY | 0.907749 | 0.993021 | 0.764474 | 0.894707 | 1.000000 | 0.740622 | 0.395934 | 0.401356 | 0.059710 | 0.873463 | 0.493825 | -0.480728 | 0.611031 | 0.631584 | -0.427157 | 0.905375 | 
| OUT_DEGREE_CENTRALITY | 0.943530 | 0.722659 | 0.992800 | 0.962756 | 0.740622 | 1.000000 | 0.531070 | 0.329623 | 0.031346 | 0.596529 | 0.667287 | -0.661709 | 0.369877 | 0.382318 | -0.258572 | 0.840450 | 
| AVG_NEIGHBOR_DEGREE | 0.539215 | 0.424597 | 0.559940 | 0.512342 | 0.395934 | 0.531070 | 1.000000 | -0.110202 | -0.009636 | 0.526910 | 0.076573 | -0.222635 | 0.560895 | 0.577173 | -0.443239 | 0.484496 | 
| PAGERANK | 0.342109 | 0.352711 | 0.301901 | 0.380614 | 0.401356 | 0.329623 | -0.110202 | 1.000000 | 0.109268 | 0.213201 | 0.603015 | -0.398695 | -0.077618 | -0.073561 | 0.041703 | 0.276802 | 
| KATZ | 0.030614 | 0.044777 | 0.018554 | 0.044864 | 0.059710 | 0.031346 | -0.009636 | 0.109268 | 1.000000 | 0.024699 | 0.084676 | -0.088603 | -0.013311 | -0.012970 | 0.032814 | 0.003484 | 
| CLOSENESS_CENTRALITY | 0.779620 | 0.888389 | 0.633813 | 0.748006 | 0.873463 | 0.596529 | 0.526910 | 0.213201 | 0.024699 | 1.000000 | 0.327792 | -0.342888 | 0.804916 | 0.828565 | -0.733591 | 0.785403 | 
| BETWEENNESS_CENTRALITY | 0.579524 | 0.435932 | 0.614865 | 0.642281 | 0.493825 | 0.667287 | 0.076573 | 0.603015 | 0.084676 | 0.327792 | 1.000000 | -0.577527 | 0.011448 | 0.014592 | -0.088210 | 0.417665 | 
| CLUSTCOEF | -0.582810 | -0.439187 | -0.617850 | -0.633303 | -0.480728 | -0.661709 | -0.222635 | -0.398695 | -0.088603 | -0.342888 | -0.577527 | 1.000000 | -0.101121 | -0.111978 | 0.185684 | -0.456913 | 
| NUM_NODES | 0.560734 | 0.675427 | 0.432515 | 0.491743 | 0.611031 | 0.369877 | 0.560895 | -0.077618 | -0.013311 | 0.804916 | 0.011448 | -0.101121 | 1.000000 | 0.969029 | -0.544588 | 0.620465 | 
| NUM_EDGES | 0.578295 | 0.696580 | 0.446061 | 0.508283 | 0.631584 | 0.382318 | 0.577173 | -0.073561 | -0.012970 | 0.828565 | 0.014592 | -0.111978 | 0.969029 | 1.000000 | -0.603055 | 0.652760 | 
| AVERAGECLUSTCOEF | -0.366186 | -0.441086 | -0.282453 | -0.343765 | -0.427157 | -0.258572 | -0.443239 | 0.041703 | 0.032814 | -0.733591 | -0.088210 | 0.185684 | -0.544588 | -0.603055 | 1.000000 | -0.359057 | 
| TRIANGLES | 0.958205 | 0.926553 | 0.884874 | 0.922991 | 0.905375 | 0.840450 | 0.484496 | 0.276802 | 0.003484 | 0.785403 | 0.417665 | -0.456913 | 0.620465 | 0.652760 | -0.359057 | 1.000000 | 
# Preview the first five periods of the percent-change network statistics.
netStatsWidePctChange.head(5)
| index | Afghanistan | ... | Yemen, P.D. Rep. | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| variable | AVERAGECLUSTCOEF | CLOSENESS_CENTRALITY | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | ... | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | PAGERANK | TRIANGLES | 
| PERIOD | |||||||||||||||||||||
| 1985-02-01 | 0.006149 | 0.008230 | -0.017379 | 0.019608 | 0.019608 | 0.043478 | 0.043478 | -0.304667 | 0.002810 | 0.0 | ... | -0.062441 | -0.018868 | -0.018868 | -0.028571 | -0.028571 | 0.017828 | 0.002810 | 0.0 | -0.066568 | 0.024390 | 
| 1985-03-01 | -0.001313 | 0.000000 | -0.027077 | 0.057692 | 0.057692 | 0.041667 | 0.041667 | -0.090895 | 0.005324 | 0.0 | ... | 0.037767 | 0.115385 | 0.115385 | 0.147059 | 0.147059 | -0.079270 | 0.005324 | 0.0 | 0.061019 | 0.214286 | 
| 1985-04-01 | 0.009252 | 0.012500 | 0.059892 | -0.072727 | -0.072727 | 0.040000 | 0.040000 | 1.068448 | 0.005575 | 0.0 | ... | -0.006298 | -0.051724 | -0.051724 | -0.076923 | -0.076923 | 0.120605 | 0.005575 | 0.0 | -0.253603 | -0.133795 | 
| 1985-05-01 | 0.002814 | 0.008403 | 0.021888 | 0.058824 | 0.058824 | 0.076923 | 0.076923 | 0.021696 | 0.009286 | 0.0 | ... | 0.017743 | 0.036364 | 0.036364 | 0.055556 | 0.055556 | -0.094392 | 0.009286 | 0.0 | 0.052174 | 0.167776 | 
| 1985-06-01 | -0.015002 | -0.028571 | -0.044265 | -0.092593 | -0.092593 | -0.214286 | -0.214286 | 0.053878 | -0.008514 | 0.0 | ... | -0.000867 | -0.052632 | -0.052632 | -0.078947 | -0.078947 | -0.101800 | -0.008514 | 0.0 | 0.216340 | -0.168757 | 
5 rows × 1618 columns
# Pearson correlation between every pair of (economy, statistic) columns.
netStatsWidePctChange.corr(method='pearson')
| index | Afghanistan | ... | Yemen, P.D. Rep. | |||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| variable | AVERAGECLUSTCOEF | CLOSENESS_CENTRALITY | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | ... | CLUSTCOEF | DEGREE | DEGREE_CENTRALITY | IN_DEGREE | IN_DEGREE_CENTRALITY | KATZ | NUM_EDGES | NUM_NODES | PAGERANK | TRIANGLES | |
| index | variable | |||||||||||||||||||||
| Afghanistan | AVERAGECLUSTCOEF | 1.000000 | 0.008563 | 0.268401 | 0.097012 | 0.112500 | -0.034662 | -0.013745 | -0.020592 | 0.258409 | -0.188789 | ... | 0.069385 | 0.034931 | 0.033566 | 0.022539 | 0.021497 | 0.012239 | 0.308246 | 0.063992 | -0.113081 | 0.018248 | 
| CLOSENESS_CENTRALITY | 0.008563 | 1.000000 | 0.069392 | 0.485661 | 0.487173 | 0.872371 | 0.877817 | -0.015789 | 0.275666 | 0.015062 | ... | -0.010067 | 0.068281 | 0.069322 | 0.035814 | 0.036912 | 0.031849 | 0.105024 | -0.061704 | -0.053218 | 0.037660 | |
| CLUSTCOEF | 0.268401 | 0.069392 | 1.000000 | -0.335240 | -0.322883 | 0.030909 | 0.045627 | -0.085917 | 0.027050 | -0.143091 | ... | 0.048122 | -0.067967 | -0.071540 | -0.016755 | -0.020086 | -0.015965 | 0.060949 | 0.178997 | -0.161005 | -0.040235 | |
| DEGREE | 0.097012 | 0.485661 | -0.335240 | 1.000000 | 0.997265 | 0.600504 | 0.602948 | 0.040934 | 0.229190 | 0.036113 | ... | 0.029834 | 0.054439 | 0.056968 | 0.037052 | 0.039531 | -0.035350 | 0.195018 | -0.133065 | 0.026756 | 0.036045 | |
| DEGREE_CENTRALITY | 0.112500 | 0.487173 | -0.322883 | 0.997265 | 1.000000 | 0.591579 | 0.599743 | 0.040700 | 0.193831 | -0.037386 | ... | 0.028419 | 0.054921 | 0.057617 | 0.037045 | 0.039675 | -0.035741 | 0.196222 | -0.141565 | 0.026265 | 0.036405 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | 
| Yemen, P.D. Rep. | KATZ | 0.012239 | 0.031849 | -0.015965 | -0.035350 | -0.035741 | 0.011687 | 0.011442 | -0.053146 | 0.018041 | 0.004437 | ... | 0.175340 | 0.135799 | 0.135630 | 0.075658 | 0.075625 | 1.000000 | 0.026173 | 0.002194 | -0.001177 | -0.026618 | 
| NUM_EDGES | 0.308246 | 0.105024 | 0.060949 | 0.195018 | 0.196222 | 0.142425 | 0.144518 | 0.077890 | 0.336385 | -0.007532 | ... | 0.137823 | 0.268694 | 0.268028 | 0.209588 | 0.209158 | 0.026173 | 1.000000 | 0.024900 | -0.057362 | 0.227043 | |
| NUM_NODES | 0.063992 | -0.061704 | 0.178997 | -0.133065 | -0.141565 | 0.006879 | -0.001520 | -0.005665 | 0.005408 | 0.112827 | ... | 0.189844 | -0.045144 | -0.064780 | 0.017885 | 0.000177 | 0.002194 | 0.024900 | 1.000000 | 0.047493 | -0.016247 | |
| PAGERANK | -0.113081 | -0.053218 | -0.161005 | 0.026756 | 0.026265 | -0.011613 | -0.012307 | 0.031506 | -0.014342 | 0.008425 | ... | 0.062099 | 0.223940 | 0.222865 | 0.309366 | 0.308527 | -0.001177 | -0.057362 | 0.047493 | 1.000000 | 0.320644 | |
| TRIANGLES | 0.018248 | 0.037660 | -0.040235 | 0.036045 | 0.036405 | 0.125626 | 0.127260 | 0.189249 | 0.077119 | -0.003858 | ... | 0.300027 | 0.861270 | 0.860715 | 0.906685 | 0.906874 | -0.026618 | 0.227043 | -0.016247 | 0.320644 | 1.000000 | |
1618 rows × 1618 columns
# Preview the first five periods of the cleaned bilateral series.
tsPctChange.head(5)
| Argentina-Brazil | Argentina-Chile | Argentina-Japan | Australia-Canada | Australia-China | Australia-France | Australia-Germany | Australia-India | Australia-Italy | Australia-Japan | ... | United Kingdom-Singapore | United Kingdom-Spain | United Kingdom-Sweden | United Kingdom-Switzerland | United Kingdom-Taiwan, Province of China | United Kingdom-Tanzania, United Republic of | United Kingdom-Thailand | United Kingdom-Turkey | United Kingdom-United Arab Emirates | United Kingdom-United States | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| period | |||||||||||||||||||||
| 1985-02-01 | 0.062522 | -0.154539 | -0.293125 | 0.142470 | 0.062065 | -0.097768 | 0.065731 | -0.131695 | 0.000170 | -0.004596 | ... | -0.036131 | -0.029949 | 0.006133 | 0.004249 | -0.030834 | -0.285420 | -0.033476 | -0.129838 | -0.018257 | -0.006231 | 
| 1985-03-01 | -0.157772 | -0.028198 | 0.374053 | 0.018934 | -0.088413 | -0.003325 | -0.037304 | 0.080223 | -0.137057 | -0.023654 | ... | 0.025743 | 0.069935 | 0.006004 | 0.013794 | 0.069704 | 0.261847 | 0.056897 | 0.113487 | 0.076519 | 0.018622 | 
| 1985-04-01 | 0.076867 | 0.130921 | 0.230518 | -0.087285 | 0.164218 | 0.056872 | -0.091518 | 0.128781 | 0.182579 | 0.051857 | ... | 0.035510 | -0.039669 | 0.050269 | 0.063971 | 0.030340 | 0.175368 | 0.097109 | 0.097419 | 0.006345 | 0.040990 | 
| 1985-05-01 | -0.079725 | -0.085492 | -0.072008 | 0.074769 | 0.035272 | -0.029843 | 0.165752 | 0.010152 | 0.004218 | -0.000145 | ... | 0.013411 | -0.008849 | -0.044910 | -0.018795 | 0.022171 | 0.196199 | -0.045813 | -0.046656 | -0.076937 | 0.022249 | 
| 1985-06-01 | 0.037981 | 0.148035 | 0.147914 | 0.027766 | 0.093546 | 0.115701 | -0.122772 | 0.011665 | -0.078161 | 0.012099 | ... | -0.056779 | 0.013070 | 0.005069 | 0.012590 | -0.035781 | -0.138247 | -0.038296 | 0.012301 | 0.063594 | -0.020316 | 
5 rows × 640 columns
# Split every 'Reference-Counterpart' column label into its two economy names.
# NOTE(review): the regression cells treat position [0] as the exporter, so
# these two names look swapped; only their union is used below.
# NOTE(review): splitting on '-' would mis-parse an economy name that itself
# contains a hyphen -- confirm none survive the filtering.
importers = pd.Series(col.split('-')[0] for col in tsPctChange.columns).unique()
exporters = pd.Series(col.split('-')[1] for col in tsPctChange.columns).unique()
allEcons = sorted(set(importers) | set(exporters))

# Number of distinct statistic names (level 1 of the column MultiIndex). Kept
# under its own name so the netStats DataFrame is not overwritten by an int,
# which would break any cell that is re-run against netStats afterwards.
numNetStats = netStatsWidePctChange.columns.get_level_values(1).nunique()
print('The upper-bound on number of tests:', len(allEcons) * numNetStats)
The upper-bound on number of tests: 1216
# Univariate OLS of every bilateral series on each network statistic of the
# economy named first in the series label (treated as the exporter).
# Each fit uses the first 90% of periods; the last 10% is held out for MSE.
econs = pd.Series(col for col in tsPctChange.columns).unique()
regRows = []
for tempSeries in econs:
    # NOTE(review): splitting on '-' assumes no economy name contains a hyphen.
    exporter = tempSeries.split('-')[0]
    # Network statistics available for the exporter. .copy() so that relabelling
    # the columns below does not write into a slice of netStatsWidePctChange.
    X_econ = netStatsWidePctChange[
        [col for col in netStatsWidePctChange.columns if col[0] == exporter]
    ].copy()
    allNs = [col[1] for col in X_econ.columns]
    X_econ.columns = allNs
    # Dependent variable: the bilateral trade series itself (one-column frame).
    y = tsPctChange[[tempSeries]]
    for tempNs in allNs:
        X = sm.add_constant(X_econ[tempNs], has_constant='add')
        # shuffle=False keeps time order, so the test set is the most recent tail.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
        reg = sm.OLS(y_train, X_train).fit()
        y_pred = reg.predict(X_test)
        # Position 0 is the constant and position 1 the statistic. .iloc is used
        # because integer keys on a label-indexed Series are deprecated as positions.
        regRows.append({
            'index': tempSeries,
            'ns': reg.params.index[1],
            'coef': reg.params.iloc[1],
            'pvalue': reg.pvalues.iloc[1],
            'r2': reg.rsquared,
            'aic': reg.aic,
            'mse': mean_squared_error(y_test, y_pred),
        })
# One row per (series, statistic) fit on a fresh 0..n-1 index.
regResults = pd.DataFrame(regRows, columns=['index', 'ns', 'coef', 'pvalue', 'r2', 'aic', 'mse'])
regResults[regResults.index.isin(regResults['pvalue'].nsmallest().index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 1858 | France-Italy | PAGERANK | 0.366934 | 3.058292e-72 | 0.560019 | -1923.270376 | 0.000415 | 
| 4186 | Italy-Austria | PAGERANK | 0.294486 | 1.078482e-65 | 0.525103 | -1723.383263 | 0.000481 | 
| 4330 | Italy-France | PAGERANK | 0.281747 | 1.283553e-79 | 0.596303 | -1873.084271 | 0.000485 | 
| 4346 | Italy-Germany | PAGERANK | 0.219648 | 5.352173e-74 | 0.568945 | -2025.713641 | 0.000250 | 
| 4762 | Italy-Spain | PAGERANK | 0.332262 | 3.005676e-61 | 0.499840 | -1587.743934 | 0.000609 | 
# The five fits with the highest in-sample R-squared, in original row order.
regResults.loc[regResults.index.isin(regResults['r2'].nlargest(5).index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 1858 | France-Italy | PAGERANK | 0.366934 | 3.058292e-72 | 0.560019 | -1923.270376 | 0.000415 | 
| 4186 | Italy-Austria | PAGERANK | 0.294486 | 1.078482e-65 | 0.525103 | -1723.383263 | 0.000481 | 
| 4330 | Italy-France | PAGERANK | 0.281747 | 1.283553e-79 | 0.596303 | -1873.084271 | 0.000485 | 
| 4346 | Italy-Germany | PAGERANK | 0.219648 | 5.352173e-74 | 0.568945 | -2025.713641 | 0.000250 | 
| 4762 | Italy-Spain | PAGERANK | 0.332262 | 3.005676e-61 | 0.499840 | -1587.743934 | 0.000609 | 
# The five fits with the lowest AIC, in original row order.
regResults.loc[regResults.index.isin(regResults['aic'].nsmallest(5).index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 885 | Canada-United States | DEGREE | 0.098988 | 1.032284e-08 | 0.079926 | -2663.455321 | 0.000103 | 
| 886 | Canada-United States | DEGREE_CENTRALITY | 0.091503 | 6.636601e-08 | 0.071443 | -2659.820961 | 0.000104 | 
| 890 | Canada-United States | NUM_EDGES | 0.254925 | 9.073834e-10 | 0.090931 | -2668.220270 | 0.000076 | 
| 892 | Canada-United States | OUT_DEGREE | 0.067155 | 7.640546e-07 | 0.060235 | -2655.069445 | 0.000106 | 
| 895 | Canada-United States | TRIANGLES | 0.091143 | 5.187018e-09 | 0.083051 | -2664.802277 | 0.000095 | 
# The five fits with the lowest hold-out MSE, in original row order.
regResults.loc[regResults.index.isin(regResults['mse'].nsmallest(5).index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 6088 | Netherlands-Germany | AVERAGECLUSTCOEF | 0.328054 | 6.201868e-05 | 0.039943 | -2443.170822 | 0.000059 | 
| 6089 | Netherlands-Germany | AVG_NEIGHBOR_DEGREE | 0.163929 | 7.750331e-05 | 0.038913 | -2442.746399 | 0.000059 | 
| 6093 | Netherlands-Germany | DEGREE | 0.096945 | 9.414008e-03 | 0.016989 | -2433.814378 | 0.000067 | 
| 6098 | Netherlands-Germany | NUM_EDGES | 0.365567 | 8.341191e-12 | 0.111869 | -2474.008866 | 0.000045 | 
| 6103 | Netherlands-Germany | TRIANGLES | 0.253912 | 5.776218e-09 | 0.082563 | -2461.152722 | 0.000053 | 
# Keep only fits with p-value below 0.05 and in-sample R-squared above 0.5,
# renumbered from zero so the rows can be addressed by position.
filteredRegResults = regResults.query('pvalue<0.05 and r2>.5').reset_index(drop=True)
filteredRegResults
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 0 | France-Italy | PAGERANK | 0.366934 | 3.058292e-72 | 0.560019 | -1923.270376 | 0.000415 | 
| 1 | Italy-Austria | PAGERANK | 0.294486 | 1.078482e-65 | 0.525103 | -1723.383263 | 0.000481 | 
| 2 | Italy-France | PAGERANK | 0.281747 | 1.283553e-79 | 0.596303 | -1873.084271 | 0.000485 | 
| 3 | Italy-Germany | PAGERANK | 0.219648 | 5.352173e-74 | 0.568945 | -2025.713641 | 0.000250 | 
Let's do a visual check of the series that came back with any remote form of a linear relationship.
We can see that many of the relationships are affected by outliers so these numbers are misleading.
from math import ceil

# Scatter grid: one panel per row of filteredRegResults, four panels per row.
ncols = 4
# At least one grid row so plt.subplots still works when no fit passed the filter.
nrows = max(1, ceil(filteredRegResults.shape[0] / ncols))
width = ncols * 5
length = nrows * 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width, length))
# zip stops at the shorter sequence, so surplus axes in the last grid row stay
# empty instead of indexing past the end of filteredRegResults.
for i, ax in zip(range(filteredRegResults.shape[0]), axes.flatten()):
    fit = filteredRegResults.iloc[i]
    # The economy named first in the pair label owns the network statistic.
    econ = fit['index'].split('-')[0]
    # x: bilateral trade series, y: that economy's network statistic.
    ax.scatter(
        x=tsPctChange[fit['index']],
        y=netStatsWidePctChange[(econ, fit['ns'])])
    ax.set_title(f"pvalue:{np.round(fit['pvalue'], 4)},  r2:{np.round(fit['r2'], 2)},  aic:{np.round(fit['aic'], 2)}")
    ax.set_ylabel(f"{fit['ns']} Percent Change")
    ax.set_xlabel(f"{fit['index']}  Percent Change")
plt.tight_layout()
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.decomposition import PCA
from numpy.linalg import eig
# %%
# Every economy that appears on either side of a bilateral series label.
importers = pd.Series([label.split('-')[0] for label in tsPctChange.columns]).unique()
exporters = pd.Series([label.split('-')[1] for label in tsPctChange.columns]).unique()
allEcons = sorted(set(importers).union(exporters))

# Figure grid with five panels per row and enough rows for every economy;
# each panel is allotted 5 x 3 inches.
ncols = 5
nrows = ceil(len(allEcons) / ncols)
width, length = ncols * 5, nrows * 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width, length))
def myplot(score, coeff, i, ax, tempSeries, labels=None):
    """Draw a PCA biplot on ``ax``: range-scaled scores plus one arrow per feature.

    score      -- 2-D array; columns 0 and 1 are the coordinates scattered.
    coeff      -- 2-D array with one row per feature; columns 0 and 1 give the
                  tip of that feature's arrow.
    i          -- not used by the drawing logic.
    ax         -- object offering scatter/arrow/text/set_title.
    tempSeries -- text used as the panel title.
    labels     -- optional per-feature captions; "Var1", "Var2", ... otherwise.
    """
    pc_x, pc_y = score[:, 0], score[:, 1]
    # Divide each axis by its own range so the point cloud spans one unit.
    x_scale = 1.0 / (pc_x.max() - pc_x.min())
    y_scale = 1.0 / (pc_y.max() - pc_y.min())
    ax.scatter(pc_x * x_scale, pc_y * y_scale)

    for row in range(coeff.shape[0]):
        tip_x, tip_y = coeff[row, 0], coeff[row, 1]
        ax.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        caption = "Var" + str(row + 1) if labels is None else labels[row]
        # Captions sit 15% beyond the arrow tip so they do not cover it.
        ax.text(tip_x * 1.15, tip_y * 1.15, caption, color='g', ha='center', va='center')
    ax.set_title(tempSeries)
# One PCA biplot per economy. zip stops at the shorter of (economies, axes).
for i, (tempSeries, ax) in enumerate(zip(allEcons, axes.flatten())):
    # All network-statistic columns that belong to this economy.
    temp = netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == tempSeries]]
    # A biplot needs two components, so economies with fewer than two
    # statistics keep an empty panel.
    if temp.shape[1] < 2:
        continue
    # Standardise first so no statistic dominates the components through scale alone.
    scaler = StandardScaler()
    X = scaler.fit_transform(temp)
    pca = PCA()
    x_new = pca.fit_transform(X)
    # Plot the first two components only. Approach follows
    # https://stackoverflow.com/questions/50796024/feature-variable-importance-after-a-pca-analysis
    myplot(x_new[:, 0:2], np.transpose(pca.components_[0:2, :]), i, ax, tempSeries, [col[1] for col in temp])
# Lay the grid out once after every panel is drawn, not once per economy.
plt.tight_layout()
Example of the Relationship between the original features and the principal components. The values can be interpreted as the correlation between the original feature and the component.
# Worked example for one economy: three-component PCA on its standardised
# network statistics, reported as a loading matrix.
econ = 'Argentina'
temp = netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == econ]]

scaler = StandardScaler()
scaledData = pd.DataFrame(scaler.fit_transform(temp))

pcaModel = PCA(n_components=3)
pcaModelFit = pcaModel.fit(scaledData)
principalComponents = pcaModelFit.transform(scaledData)

# Loadings: each component's weights scaled by the square root of the variance
# that component explains.
loadings = np.sqrt(pcaModelFit.explained_variance_) * pcaModelFit.components_.T
loading_matrix = pd.DataFrame(loadings, index=temp.columns)

# Share of total variance captured by the three retained components.
print(pcaModelFit.explained_variance_ratio_.sum())
loading_matrix.sort_values(by=[0], ascending=False)
0.7227717252387083
| 0 | 1 | 2 | ||
|---|---|---|---|---|
| index | variable | |||
| Argentina | DEGREE_CENTRALITY | 0.954604 | 0.128942 | 0.040180 | 
| DEGREE | 0.948894 | 0.176514 | 0.059231 | |
| CLOSENESS_CENTRALITY | 0.904727 | -0.136198 | -0.089371 | |
| IN_DEGREE_CENTRALITY | 0.903900 | -0.108930 | -0.080641 | |
| IN_DEGREE | 0.899232 | -0.047406 | -0.056052 | |
| TRIANGLES | 0.889398 | 0.252981 | 0.033791 | |
| PAGERANK | 0.059270 | -0.464178 | -0.388364 | |
| AVERAGECLUSTCOEF | 0.047154 | 0.108996 | -0.881155 | |
| KATZ | 0.012647 | -0.053412 | -0.066499 | |
| NUM_EDGES | -0.006441 | 0.832203 | -0.374385 | |
| NUM_NODES | -0.224252 | 0.776487 | 0.244186 | |
| CLUSTCOEF | -0.840177 | 0.058003 | -0.234825 | 
an attempt to interpret the principal components:
PC 0: Centrality/Degree measures -> "Connectivity"
PC 1: Macro features such as the number of edges and nodes, while being negatively related to PageRank values
PC 2: A bit of everything; it overlaps PageRank and the number of edges/nodes, which are clearly separated in PC 1
# Univariate OLS of every bilateral series on each of the first three
# principal components of the network statistics of the economy named first
# in the series label. First 90% of periods train; the last 10% is held out.
econs = pd.Series(col for col in tsPctChange.columns).unique()
n_components = 3
pcaRows = []
for tempSeries in econs:
    # NOTE(review): splitting on '-' assumes no economy name contains a hyphen.
    refEcon = tempSeries.split('-')[0]
    # Network statistics for the reference economy.
    X_econ = netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == refEcon]]
    # Skip economies that have no network statistics at all.
    if X_econ.shape[1] == 0:
        continue
    # Trade time series for this pair (one-column frame).
    y = tsPctChange[[tempSeries]]
    # NOTE(review): the scaler and PCA are fit on the full sample, hold-out
    # periods included, so the reported test MSE is not strictly out-of-sample.
    scaler = StandardScaler()
    scaledData = pd.DataFrame(scaler.fit_transform(X_econ))
    pcaModel = PCA(n_components=n_components)
    pcaModelFit = pcaModel.fit(scaledData)
    # Component scores, labelled '0', '1', '2' and placed on y's index so
    # statsmodels sees matching indices for endog and exog.
    X_econ = pd.DataFrame(
        pcaModelFit.transform(scaledData),
        columns=[str(col) for col in range(n_components)],
        index=y.index,
    )
    for tempNs in X_econ.columns:
        X = sm.add_constant(X_econ[tempNs], has_constant='add')
        # shuffle=False keeps time order, so the test set is the most recent tail.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
        reg = sm.OLS(y_train, X_train).fit()
        y_pred = reg.predict(X_test)
        # Position 0 is the constant and position 1 the component. .iloc is used
        # because integer keys on a label-indexed Series are deprecated as positions.
        pcaRows.append({
            'index': tempSeries,
            'ns': reg.params.index[1],
            'coef': reg.params.iloc[1],
            'pvalue': reg.pvalues.iloc[1],
            'r2': reg.rsquared,
            'aic': reg.aic,
            'mse': mean_squared_error(y_test, y_pred),
        })
# One row per (series, component) fit on a fresh 0..n-1 index.
regResultsPCA = pd.DataFrame(pcaRows, columns=['index', 'ns', 'coef', 'pvalue', 'r2', 'aic', 'mse'])
regResultsPCA[regResultsPCA.index.isin(regResultsPCA['pvalue'].nsmallest().index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 867 | Italy-Austria | 0 | 0.006742 | 1.853681e-23 | 0.223638 | -1528.740775 | 0.000760 | 
| 894 | Italy-France | 0 | 0.005892 | 3.648805e-22 | 0.211911 | -1608.181766 | 0.000866 | 
| 897 | Italy-Germany | 0 | 0.004781 | 5.983764e-23 | 0.219047 | -1790.378996 | 0.000484 | 
| 948 | Italy-Netherlands | 0 | 0.006450 | 4.852923e-22 | 0.210780 | -1533.830995 | 0.000644 | 
| 1002 | Italy-United Kingdom | 0 | 0.005826 | 3.209635e-23 | 0.221491 | -1639.549949 | 0.000803 | 
# The five PCA fits with the highest in-sample R-squared, in original row order.
regResultsPCA.loc[regResultsPCA.index.isin(regResultsPCA['r2'].nlargest(5).index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 867 | Italy-Austria | 0 | 0.006742 | 1.853681e-23 | 0.223638 | -1528.740775 | 0.000760 | 
| 894 | Italy-France | 0 | 0.005892 | 3.648805e-22 | 0.211911 | -1608.181766 | 0.000866 | 
| 897 | Italy-Germany | 0 | 0.004781 | 5.983764e-23 | 0.219047 | -1790.378996 | 0.000484 | 
| 948 | Italy-Netherlands | 0 | 0.006450 | 4.852923e-22 | 0.210780 | -1533.830995 | 0.000644 | 
| 1002 | Italy-United Kingdom | 0 | 0.005826 | 3.209635e-23 | 0.221491 | -1639.549949 | 0.000803 | 
# The five PCA fits with the lowest AIC, in original row order.
regResultsPCA.loc[regResultsPCA.index.isin(regResultsPCA['aic'].nsmallest(5).index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 177 | Canada-United States | 0 | 0.000743 | 5.443885e-07 | 0.061794 | -2655.726957 | 0.000107 | 
| 178 | Canada-United States | 1 | 0.000766 | 6.290972e-03 | 0.018792 | -2637.980310 | 0.000104 | 
| 179 | Canada-United States | 2 | 0.001416 | 1.241709e-05 | 0.047375 | -2649.687178 | 0.000095 | 
| 564 | Germany-Netherlands | 0 | 0.000564 | 6.635361e-03 | 0.018553 | -2453.067065 | 0.000099 | 
| 565 | Germany-Netherlands | 1 | 0.001542 | 4.810872e-06 | 0.051754 | -2466.695017 | 0.000077 | 
# The five PCA fits with the lowest hold-out MSE, in original row order.
regResultsPCA.loc[regResultsPCA.index.isin(regResultsPCA['mse'].nsmallest(5).index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 179 | Canada-United States | 2 | 0.001416 | 0.000012 | 0.047375 | -2649.687178 | 0.000095 | 
| 565 | Germany-Netherlands | 1 | 0.001542 | 0.000005 | 0.051754 | -2466.695017 | 0.000077 | 
| 1242 | Netherlands-Germany | 0 | 0.000361 | 0.104435 | 0.006678 | -2429.682232 | 0.000069 | 
| 1243 | Netherlands-Germany | 1 | 0.001509 | 0.000002 | 0.055816 | -2449.773089 | 0.000059 | 
| 1244 | Netherlands-Germany | 2 | 0.001605 | 0.000004 | 0.052761 | -2448.493640 | 0.000055 | 
# The five PCA fits with the largest coefficient magnitude, in original row order.
regResultsPCA.loc[regResultsPCA.index.isin(regResultsPCA['coef'].abs().nlargest(5).index)]
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 119 | Canada-Denmark | 2 | 0.026723 | 3.769401e-03 | 0.021100 | 7.875695 | 0.037339 | 
| 640 | Greece-Egypt | 1 | -0.021450 | 4.405205e-02 | 0.010251 | 110.083040 | 0.015408 | 
| 1433 | Portugal-Japan | 2 | -0.021662 | 1.224228e-02 | 0.015821 | -183.288771 | 0.016500 | 
| 1566 | Sri Lanka-Japan | 0 | 0.024138 | 3.983916e-16 | 0.154936 | -634.561938 | 0.012071 | 
| 1688 | Switzerland-New Zealand | 2 | 0.022450 | 7.622431e-07 | 0.060246 | -515.664428 | 0.056722 | 
# Keep PCA fits with p-value below 0.1 and in-sample R-squared above 0.2,
# renumbered from zero so the rows can be addressed by position.
filteredregResultsPCA = regResultsPCA.query('pvalue<0.1 and r2>0.2').reset_index(drop=True)
filteredregResultsPCA
| index | ns | coef | pvalue | r2 | aic | mse | |
|---|---|---|---|---|---|---|---|
| 0 | Italy-Austria | 0 | 0.006742 | 1.853681e-23 | 0.223638 | -1528.740775 | 0.000760 | 
| 1 | Italy-Denmark | 0 | 0.007898 | 1.670098e-21 | 0.205860 | -1361.671432 | 0.001131 | 
| 2 | Italy-France | 0 | 0.005892 | 3.648805e-22 | 0.211911 | -1608.181766 | 0.000866 | 
| 3 | Italy-Germany | 0 | 0.004781 | 5.983764e-23 | 0.219047 | -1790.378996 | 0.000484 | 
| 4 | Italy-Netherlands | 0 | 0.006450 | 4.852923e-22 | 0.210780 | -1533.830995 | 0.000644 | 
| 5 | Italy-United Kingdom | 0 | 0.005826 | 3.209635e-23 | 0.221491 | -1639.549949 | 0.000803 | 
from math import ceil

# Scatter grid: one panel per row of filteredregResultsPCA, four panels per row.
ncols = 4
# At least one grid row so plt.subplots still works when no fit passed the filter.
nrows = max(1, ceil(filteredregResultsPCA.shape[0] / ncols))
width = ncols * 5
length = nrows * 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, dpi=120, figsize=(width, length))
# zip stops at the shorter sequence, so surplus axes in the last grid row stay empty.
for i, ax in zip(range(filteredregResultsPCA.shape[0]), axes.flatten()):
    fit = filteredregResultsPCA.iloc[i]
    # The economy named first in the pair label owns the network statistics.
    econ = fit['index'].split('-')[0]
    temp = netStatsWidePctChange[[col for col in netStatsWidePctChange.columns if col[0] == econ]]
    if temp.shape[1] == 0:
        continue
    # Rebuild the same three-component PCA the regression used for this economy.
    scaler = StandardScaler()
    scaledData = pd.DataFrame(scaler.fit_transform(temp))
    pcaModel = PCA(n_components=3)
    pcaModelFit = pcaModel.fit(scaledData)
    principalComponents = pcaModelFit.transform(scaledData)
    # x: bilateral trade series, y: scores of the component named in 'ns'.
    ax.scatter(
        x=tsPctChange[fit['index']],
        y=principalComponents[:, int(fit['ns'])],
    )
    ax.set_title(f"pvalue:{np.round(fit['pvalue'], 5)},  r2:{np.round(fit['r2'], 2)},  aic:{np.round(fit['aic'], 2)}")
    # The y values are component scores, not percent changes, so label them as such.
    ax.set_ylabel(f"PC {fit['ns']} score")
    ax.set_xlabel(f"{fit['index']}")
plt.tight_layout()