import numpy as np
from numpy.core.fromnumeric import sort
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.decomposition import PCA

from numpy.linalg import eig


mcl = pd.read_csv('countryList.csv',
                 dtype={'Name':'str',
                        'ISO':'int',
                        'IC':'str',
                        'Region':'str'})

mcl.head()


data = pd.read_csv(r'C:\Users\yosty\Desktop\Desktop_Folder\03 - Data\wto\newest\HS_M_0010.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)

print("dimensions of raw data", data.shape)
data.head()

dimensions of raw data (7892410, 20)


print(f"There are {data['ProductOrSector'].nunique()} unique products being traded in the dataset.")

There are 96 unique products being traded in the dataset.


data['Year'].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 1996, 1997, 1998, 1999,
       2019, 2020], dtype=int64)


data['ReportingEconomy'].nunique()

137


data['PartnerEconomy'].nunique()

191


# remove these groups as this will lead to double counting
econData = pd.DataFrame(data
            .query('PartnerEconomy != "World"')
            .query('ReportingEconomy != "European Union"')
            .groupby(['Year', 'ReportingEconomy', 'ProductOrSector', 'PartnerEconomy'])['Value']
            .agg('sum')
).reset_index()

# pivot the data to wide
econDataWide = pd.pivot_table(econData,
                          columns= ['ReportingEconomy', 'PartnerEconomy'],
                          index='ProductOrSector',
                          values='Value',
                          aggfunc= np.sum)


# drop trading pairs that have more than 20% data missing
econDataWideNoNan = econDataWide.loc[:, econDataWide.isnull().mean() < .2]

print(f'Lost {np.round((1 - econDataWideNoNan.shape[1] / econDataWide.shape[1])*100, 2)}% of Columns as they had more than 20% of data missing')


econDataWideNoNan.head()

Lost 79.8% of Columns as they had more than 20% of data missing


econDataPercentTotal = econDataWideNoNan.melt(ignore_index=False).reset_index()

econDataPercentTotal['total'] = econDataPercentTotal.groupby(['ReportingEconomy', 'PartnerEconomy']).transform('sum')['value']

econDataPercentTotal['percent'] = econDataPercentTotal['value'] / econDataPercentTotal['total']

econDataPercentTotal['total'] = econDataPercentTotal.groupby(['ReportingEconomy', 'PartnerEconomy']).transform('sum')['percent']

econDataPercentTotal['total'].unique()

econDataPercentTotal.drop(['value', 'total'] ,axis=1)

econDataWide2 = pd.pivot_table(econDataPercentTotal,
                          columns= 'ProductOrSector',
                          index=['ReportingEconomy', 'PartnerEconomy'],
                          values='percent',
                          aggfunc= np.sum)

econDataWide2.head()


# define groups to use for plotting
groupings = pd.DataFrame(list(econDataWide2.index), columns= ['reporting', 'partner'])

groups = (pd.merge(left = groupings,
            right=mcl[['Name', 'IC', 'Region']],
            left_on='reporting',
            right_on='Name',
            how='left')
            .rename({'IC':'reportingIC', 'Region': 'reportingRegion'}, axis=1)
            .drop('Name', axis=1)
)
groups = (pd.merge(left = groups,
            right=mcl[['Name', 'IC', 'Region']],
            left_on='partner',
            right_on='Name',
            how='left')
            .rename({'IC':'partnerIC', 'Region': 'partnerRegion'}, axis=1)
            .drop('Name', axis=1)
)
groups['reportPartnerIC'] = groups['reportingIC'] + '-' + groups['partnerIC']
groups['reportPartnerRegion'] = groups['reportingRegion'] + '-' + groups['partnerRegion']


groupList = ['reportingRegion', 'partnerRegion',
            'reportingIC', 'partnerIC']

fig, axes = plt.subplots(2, 2, figsize=(15,15), dpi=800)



for i, group in enumerate(groupList):

    rowDict = {0:0, 1:0, 2:1, 3:1}
    colDict = {0:0, 1:1, 2:0, 3:1}

    col = colDict[i]
    row = rowDict[i]

    scaler = StandardScaler()

    scaledData = pd.DataFrame(scaler.fit_transform(econDataWide2))

    #####   PCA

    # create model
    pcaModel = PCA(n_components=2)

    # fit model
    pcaModelFit = pcaModel.fit(scaledData)
    principalComponents = pcaModelFit.transform(scaledData)

    # get data
    pca = pd.DataFrame(data = principalComponents, columns = ['One', 'Two'])

    #plt.figure()

    ax = sns.scatterplot(x='One', y='Two',
        data=pca,
        hue = groups[group],
        ax=axes[row,col])

    ax.set(xlabel = f'One: {np.round(pcaModelFit.explained_variance_[0],2)}',
           ylabel = f'Two: {np.round(pcaModelFit.explained_variance_[1],2)}')

    ax.set_title(f'PCA {group}, Total Variance Explained: \
{np.round(pcaModelFit.explained_variance_.sum(),2)}%')

    ax.legend(fontsize = 15,
                bbox_to_anchor= (1.03, 1),
                title=group,
                title_fontsize = 18,
                shadow = True,
                facecolor = 'white')

    if col == 0:
        ax.legend_.remove()


#ax.legend_.remove()

	Name	ISO	IC	Region
0	World	8	0	0
1	Austria	122	High income	Europe & Central Asia
2	Barbados	316	High income	Latin America & Caribbean
3	Belgium	124	High income	Europe & Central Asia
4	Bulgaria	918	Upper middle income	Europe & Central Asia

	Frequency	FrequencyCode	Indicator	IndicatorCategory	IndicatorCategoryCode	IndicatorCode	PartnerEconomy	PartnerEconomyCode	Period	PeriodCode	ProductOrSector	ProductOrSectorClassification	ProductOrSectorClassificationCode	ProductOrSectorCode	ReportingEconomy	ReportingEconomyCode	Unit	UnitCode	Value	Year
0	Annual	A	Bilateral imports by detailed HS codes (2,4,6 ...	Bilateral imports	TPM_HS	HS_M_0010	World	0	Annual	A	Dairy produce; birds' eggs; natural honey; edi...	Harmonized System	HS	4	Albania	8	US$	USD	5017815.0	2000
1	Annual	A	Bilateral imports by detailed HS codes (2,4,6 ...	Bilateral imports	TPM_HS	HS_M_0010	Austria	40	Annual	A	Dairy produce; birds' eggs; natural honey; edi...	Harmonized System	HS	4	Albania	8	US$	USD	240658.0	2000
2	Annual	A	Bilateral imports by detailed HS codes (2,4,6 ...	Bilateral imports	TPM_HS	HS_M_0010	Barbados	52	Annual	A	Dairy produce; birds' eggs; natural honey; edi...	Harmonized System	HS	4	Albania	8	US$	USD	40269.0	2000
3	Annual	A	Bilateral imports by detailed HS codes (2,4,6 ...	Bilateral imports	TPM_HS	HS_M_0010	Belgium	56	Annual	A	Dairy produce; birds' eggs; natural honey; edi...	Harmonized System	HS	4	Albania	8	US$	USD	136658.0	2000
4	Annual	A	Bilateral imports by detailed HS codes (2,4,6 ...	Bilateral imports	TPM_HS	HS_M_0010	Bulgaria	100	Annual	A	Dairy produce; birds' eggs; natural honey; edi...	Harmonized System	HS	4	Albania	8	US$	USD	187198.0	2000

ReportingEconomy	Albania										...	Zimbabwe
PartnerEconomy	Austria	Barbados	Belgium	Bosnia and Herzegovina	Bulgaria	Canada	China	Croatia	Czech Republic	Denmark	...	South Africa	Spain	Sweden	Switzerland	Tanzania	Thailand	United Arab Emirates	United Kingdom	United States of America	Zambia
ProductOrSector
Aircraft, spacecraft, and parts thereof	6367464.0	284440.0	62368.0	NaN	131285.0	76607.0	64516.0	1875040.0	58062.0	60879.0	...	4.030082e+07	741652.0	153838.0	59624.0	299.0	NaN	24807312.0	7686171.0	34299862.0	29263.0
Albuminoidal substances; modified starches; glues; enzymes	1131359.0	382418.0	536560.0	32453.0	773552.0	52211.0	5021175.0	218198.0	31217.0	3142365.0	...	8.773851e+07	54748.0	10956.0	247022.0	270.0	261274.0	634573.0	1049498.0	896952.0	289912.0
Aluminium and articles thereof	3569213.0	133269.0	11341345.0	1951007.0	7071805.0	1820559.0	48353809.0	8061058.0	801067.0	50957.0	...	1.717046e+08	250910.0	176154.0	1553101.0	9936.0	89740.0	850402.0	3743695.0	1631916.0	2778835.0
Animal or vegetable fats and oils and their cleavage products; prepared edible fats; animal or vegetable waxes	628411.0	491635.0	2926745.0	5310166.0	72133445.0	198000.0	2464104.0	163554.0	42536.0	168406.0	...	1.339048e+09	167836.0	377.0	8314900.0	81323.0	27.0	3563146.0	107873.0	18475382.0	2762538.0
Arms and ammunition; parts and accessories thereof	93855.0	653.0	NaN	NaN	12847.0	NaN	44945.0	5967.0	680.0	200.0	...	5.601368e+06	555793.0	277279.0	2901538.0	378.0	NaN	149364.0	1018973.0	817551.0	69629.0

	ProductOrSector	Aircraft, spacecraft, and parts thereof	Albuminoidal substances; modified starches; glues; enzymes	Aluminium and articles thereof	Animal or vegetable fats and oils and their cleavage products; prepared edible fats; animal or vegetable waxes	Arms and ammunition; parts and accessories thereof	Articles of apparel and clothing accessories, knitted or crocheted	Articles of apparel and clothing accessories, not knitted or crocheted	Articles of iron or steel	Articles of leather; saddlery and harness; travel goods, handbags and similar containers; articles of animal gut (other than silkworm gut)	Articles of stone, plaster, cement, asbestos, mica or similar materials	...	Tools, implements, cutlery, spoons and forks, of base metal; parts thereof of base metal	Toys, games and sports requisites; parts and accessories thereof	Umbrellas, sun umbrellas, walkingsticks, seatsticks, whips, ridingcrops and parts thereof	Vegetable plaiting materials; vegetable products not elsewhere specified or included	Vehicles other than railway or tramway rollingstock, and parts and accessories thereof	Wadding, felt and nonwovens; special yarns; twine, cordage, ropes and cables and articles thereof	Wood and articles of wood; wood charcoal	Wool, fine or coarse animal hair; horsehair yarn and woven fabric	Works of art, collectors' pieces and antiques	Zinc and articles thereof
ReportingEconomy	PartnerEconomy
Albania	Austria	0.007191	0.001278	0.004031	0.000710	0.000106	0.000479	0.000553	0.048062	0.000147	0.004221	...	0.002282	0.034772	7.642895e-04	0.000000	0.019462	0.001171	0.034367	0.000013	2.269703e-04	6.868548e-05
	Barbados	0.002933	0.003943	0.001374	0.005069	0.000007	0.000472	0.000757	0.019468	0.000640	0.000197	...	0.001083	0.000689	4.227333e-07	0.000001	0.115439	0.000035	0.008318	0.000060	1.752797e-06	4.330439e-07
	Belgium	0.000253	0.002180	0.046081	0.011892	0.000000	0.000212	0.000353	0.004463	0.000326	0.000321	...	0.000626	0.001430	6.565973e-06	0.000025	0.119086	0.000661	0.004774	0.000000	9.418270e-06	6.098797e-04
	Bosnia and Herzegovina	0.000000	0.000097	0.005814	0.015824	0.000000	0.000283	0.000239	0.038048	0.000023	0.002767	...	0.000046	0.002633	1.609544e-02	0.000000	0.002667	0.000033	0.241463	0.000000	7.569211e-07	0.000000e+00
	Bulgaria	0.000115	0.000675	0.006170	0.062931	0.000011	0.005991	0.002089	0.025124	0.000289	0.012392	...	0.000293	0.001725	2.350474e-03	0.000003	0.007457	0.001814	0.030793	0.005821	2.285757e-07	8.908345e-06

Load and Clean Data¶

Pool by Year¶

Remove Trading Pairs without much activity¶

Convert data to percent of total basket¶

Create economic groups for plotting¶

Chart¶