How does the learned representation change as an encoder gets deeper? Bilateral, product-level trade baskets are fed into autoencoders of different depths. A hold-out dataset is then encoded, and the encodings are examined through their distributions and with PCA and t-SNE.
from base64 import encode
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from math import ceil
# Fixed integers for random number generation; presumably kept so runs can be
# made repeatable -- confirm they are passed wherever randomness is involved.
seed = rand_state = 11

# Report how many GPU devices TensorFlow can see on this machine.
_visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(_visible_gpus))
Num GPUs Available: 0
########## load data
# Folder holding the trade extract. NOTE(review): this is a machine-specific
# absolute path -- point it at your own copy of the data before running.
# (Named `data_dir` rather than `dir` so the builtin `dir()` is not shadowed.)
data_dir = r'C:\Users\yosty\Desktop\Desktop_Folder\03 - Data\wto\newest'
data = pd.read_csv(os.path.join(data_dir, 'HS_M_0010.csv'))

# Country lookup table, read relative to the current working directory.
mcl = pd.read_csv(
    'countryList.csv',
    dtype={
        'Name': 'str',
        'ISO': 'int',
        'IC': 'str',
        'Region': 'str',
    },
)

# One row per (reporter, partner, year) and one column per product, holding
# the summed `Value` for 2000-2020; combinations without a record become 0.
# The string 'sum' selects the same aggregation as np.sum without triggering
# pandas' deprecation warning for NumPy callables.
encodeData = pd.pivot_table(
    data[data['Year'].isin(range(2000, 2021))],
    index=['ReportingEconomy', 'PartnerEconomy', 'Year'],
    columns='ProductOrSector',
    values='Value',
    aggfunc='sum',
).fillna(0)
encodeData.head()
ProductOrSector | Aircraft, spacecraft, and parts thereof | Albuminoidal substances; modified starches; glues; enzymes | Aluminium and articles thereof | Animal or vegetable fats and oils and their cleavage products; prepared edible fats; animal or vegetable waxes | Arms and ammunition; parts and accessories thereof | Articles of apparel and clothing accessories, knitted or crocheted | Articles of apparel and clothing accessories, not knitted or crocheted | Articles of iron or steel | Articles of leather; saddlery and harness; travel goods, handbags and similar containers; articles of animal gut (other than silkworm gut) | Articles of stone, plaster, cement, asbestos, mica or similar materials | ... | Tools, implements, cutlery, spoons and forks, of base metal; parts thereof of base metal | Toys, games and sports requisites; parts and accessories thereof | Umbrellas, sun umbrellas, walkingsticks, seatsticks, whips, ridingcrops and parts thereof | Vegetable plaiting materials; vegetable products not elsewhere specified or included | Vehicles other than railway or tramway rollingstock, and parts and accessories thereof | Wadding, felt and nonwovens; special yarns; twine, cordage, ropes and cables and articles thereof | Wood and articles of wood; wood charcoal | Wool, fine or coarse animal hair; horsehair yarn and woven fabric | Works of art, collectors' pieces and antiques | Zinc and articles thereof | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ReportingEconomy | PartnerEconomy | Year | |||||||||||||||||||||
Albania | Afghanistan | 2005 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2008 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 19.0 | 0.0 | ||
2010 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 193.0 | 847.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 150.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ||
2012 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2735.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ||
2013 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 96 columns
# Express every basket as shares of its own total: each row of `encodeData`
# is divided by its row sum, so the product columns of a row add up to 1.
#
# This replaces a melt -> groupby -> pivot round trip that produced the same
# table, but whose `transform('sum')` also ran over the non-numeric product
# column and whose `drop(...)` result was never assigned.
_basket_totals = encodeData.sum(axis=1)

# A basket whose total is 0 yields 0/0 = NaN; report those shares as 0.
econDataPercentTotal = encodeData.div(_basket_totals, axis=0).fillna(0)
econDataPercentTotal.head()
ProductOrSector | Aircraft, spacecraft, and parts thereof | Albuminoidal substances; modified starches; glues; enzymes | Aluminium and articles thereof | Animal or vegetable fats and oils and their cleavage products; prepared edible fats; animal or vegetable waxes | Arms and ammunition; parts and accessories thereof | Articles of apparel and clothing accessories, knitted or crocheted | Articles of apparel and clothing accessories, not knitted or crocheted | Articles of iron or steel | Articles of leather; saddlery and harness; travel goods, handbags and similar containers; articles of animal gut (other than silkworm gut) | Articles of stone, plaster, cement, asbestos, mica or similar materials | ... | Tools, implements, cutlery, spoons and forks, of base metal; parts thereof of base metal | Toys, games and sports requisites; parts and accessories thereof | Umbrellas, sun umbrellas, walkingsticks, seatsticks, whips, ridingcrops and parts thereof | Vegetable plaiting materials; vegetable products not elsewhere specified or included | Vehicles other than railway or tramway rollingstock, and parts and accessories thereof | Wadding, felt and nonwovens; special yarns; twine, cordage, ropes and cables and articles thereof | Wood and articles of wood; wood charcoal | Wool, fine or coarse animal hair; horsehair yarn and woven fabric | Works of art, collectors' pieces and antiques | Zinc and articles thereof | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ReportingEconomy | PartnerEconomy | Year | |||||||||||||||||||||
Albania | Afghanistan | 2005 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2008 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | ||
2010 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.103485 | 0.454155 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.080429 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ||
2012 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.957968 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ||
2013 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 96 columns
# Five products with the highest average share across all baskets.
# nlargest() already returns its result in descending order, so sorting the
# whole series beforehand is unnecessary.
econDataPercentTotal.mean().nlargest(5)
ProductOrSector Nuclear reactors, boilers, machinery and mechanical appliances; parts thereof 0.093345 Electrical machinery and equipment and parts thereof; sound recorders and reproducers, television image and sound recorders and reproducers, and parts and accessories of such articles 0.085582 Mineral fuels, mineral oils and products of their distillation; bituminous substances; mineral waxes 0.058911 Vehicles other than railway or tramway rollingstock, and parts and accessories thereof 0.042113 Pharmaceutical products 0.031543 dtype: float64
class shallow(Model):
    """Autoencoder with a single Dense layer on each side of the bottleneck.

    Args:
        latent_dim: Number of units in the encoder output. Defaults to 8,
            which is the width the encoder used before this argument was
            honoured (it was previously accepted but ignored).
    """

    def __init__(self, latent_dim=8):
        super().__init__()
        self.latent_dim = latent_dim
        self.encoder = tf.keras.Sequential([
            Dense(latent_dim, activation="relu"),
        ])
        # 96 output units with a sigmoid, so every reconstructed value lies
        # between 0 and 1.
        self.decoder = tf.keras.Sequential([
            Dense(96, activation="sigmoid"),
        ])

    def call(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))
class deep(Model):
    """Autoencoder with three Dense layers on each side of the bottleneck.

    The encoder narrows 48 -> 24 -> ``latent_dim``; the decoder widens
    24 -> 48 -> 96.

    Args:
        latent_dim: Number of units in the encoder output. Defaults to 8,
            which is the width the encoder used before this argument was
            honoured (it was previously accepted but ignored).
    """

    def __init__(self, latent_dim=8):
        super().__init__()
        self.latent_dim = latent_dim
        self.encoder = tf.keras.Sequential([
            Dense(48, activation="relu"),
            Dense(24, activation="relu"),
            Dense(latent_dim, activation="relu"),
        ])
        # Final layer has 96 units with a sigmoid, so every reconstructed
        # value lies between 0 and 1.
        self.decoder = tf.keras.Sequential([
            Dense(24, activation="relu"),
            Dense(48, activation="relu"),
            Dense(96, activation="sigmoid"),
        ])

    def call(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))
class deeep(Model):
    """Autoencoder with eleven Dense layers on each side of the bottleneck.

    The encoder narrows in steps of 8 units (88, 80, ..., 16) down to
    ``latent_dim``; the decoder mirrors it (16, 24, ..., 88) and ends in a
    96-unit sigmoid layer.

    Args:
        latent_dim: Number of units in the encoder output. Defaults to 8,
            which is the width the encoder used before this argument was
            honoured (it was previously accepted but ignored).
    """

    def __init__(self, latent_dim=8):
        super().__init__()
        self.latent_dim = latent_dim

        # Hidden widths 88, 80, ..., 16, followed by the bottleneck itself.
        encoder_layers = [
            Dense(units, activation="relu") for units in range(88, 8, -8)
        ]
        encoder_layers.append(Dense(latent_dim, activation="relu"))
        self.encoder = tf.keras.Sequential(encoder_layers)

        # Hidden widths 16, 24, ..., 88, followed by the sigmoid output layer
        # that keeps every reconstructed value between 0 and 1.
        decoder_layers = [
            Dense(units, activation="relu") for units in range(16, 96, 8)
        ]
        decoder_layers.append(Dense(96, activation="sigmoid"))
        self.decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))
def pca_plot(score, coeff, pcaModelFit, labels=None):
    """Draw a PCA biplot: first two component scores plus loading arrows.

    Args:
        score: 2-D array whose columns 0 and 1 hold the scores on the first
            two principal components.
        coeff: 2-D array with one row per original variable; columns 0 and 1
            hold that variable's loadings on the two components.
        pcaModelFit: Fitted PCA object exposing ``explained_variance_ratio_``.
        labels: Optional sequence with one name per row of ``coeff``. When
            omitted, arrows are labelled "Var1", "Var2", ...

    Returns:
        The matplotlib Axes the biplot was drawn on.
    """
    xs = score[:, 0]
    ys = score[:, 1]

    # Rescale each score axis to unit range so the points and the loading
    # arrows share a comparable scale. A constant axis (zero range) is left
    # unscaled instead of dividing by zero.
    x_span = np.ptp(xs)
    y_span = np.ptp(ys)
    scalex = 1.0 / x_span if x_span else 1.0
    scaley = 1.0 / y_span if y_span else 1.0

    ax = sns.scatterplot(x=xs * scalex, y=ys * scaley)

    # Use the variance *ratio*: `explained_variance_` is an absolute variance
    # and must not be reported with a percent sign.
    ratio = pcaModelFit.explained_variance_ratio_
    ax.set(xlabel=f'One: {ratio[0]:.2%}',
           ylabel=f'Two: {ratio[1]:.2%}')
    ax.set_title(f'PCA Total Variance Explained: {ratio[0] + ratio[1]:.2%}')

    # Only build a legend when something is labelled; calling legend() on an
    # unlabelled scatter just emits a "No handles with labels" warning.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(fontsize=15,
                  bbox_to_anchor=(1.03, 1),
                  title_fontsize=18,
                  shadow=True,
                  facecolor='white')

    # One arrow per original variable, drawn on the same axes that is
    # returned, with its label placed slightly beyond the arrow tip.
    for i, (dx, dy) in enumerate(coeff[:, :2]):
        ax.arrow(0, 0, dx, dy, color='r', alpha=0.5)
        name = "Var" + str(i + 1) if labels is None else labels[i]
        ax.text(dx * 1.15, dy * 1.15, name,
                color='g', ha='center', va='center')
    return ax
def distributions_plots(X, title=None, ncols=3, bins=100):
    """Plot one histogram per column of ``X`` on a grid of subplots.

    Args:
        X: DataFrame; every column gets its own histogram.
        title: Text appended to the figure's "Encoding Distributions:" heading.
        ncols: Number of subplot columns in the grid.
        bins: Number of bins used for each histogram.

    Returns:
        The matplotlib Figure that holds the grid.
    """
    n_series = len(X.columns)
    # Always keep at least one row so an empty frame still yields a figure.
    nrows = max(1, ceil(n_series / ncols))

    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        dpi=120,
        figsize=(ncols * 5, nrows * 3),
        squeeze=False,  # always a 2-D array of axes, whatever the grid shape
    )
    panels = axes.ravel()

    for panel, (_, series) in zip(panels, X.items()):
        panel.hist(series, bins=bins)

    # Grid cells beyond the last column would otherwise show as empty axes.
    for spare in panels[n_series:]:
        spare.set_visible(False)

    fig.suptitle(f"Encoding Distributions: {title}")
    return fig
# Hold out 10% of the baskets *before* fitting the scaler, so the per-column
# minima and maxima are learned from the training rows only.
_train_rows, _test_rows = train_test_split(
    econDataPercentTotal, test_size=.1, random_state=42, shuffle=True)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(_train_rows)
X_test = scaler.transform(_test_rows)

models = [shallow, deep, deeep]
for model in models:
    print(model)

    # Train the autoencoder to reproduce its own input.
    autoencoder = model()
    autoencoder.compile(optimizer='adam', loss='mae')
    autoencoder.fit(X_train, X_train,
                    epochs=10,
                    shuffle=True)

    # Bottleneck activations for the held-out rows, one column per unit.
    encoded = pd.DataFrame(autoencoder.encoder(X_test).numpy())

    # distributions_plots opens its own figure, so no plt.figure() is needed
    # here (an extra one only leaves an empty figure behind). The class name,
    # not the class object, is used so the title reads e.g. "shallow".
    distributions_plots(encoded, title=model.__name__)

    # PCA biplot of the encodings.
    pcaModel = PCA()
    pcaModelFit = pcaModel.fit(encoded)
    principalComponents = pcaModelFit.transform(encoded)
    plt.figure()
    pca_plot(
        score=principalComponents[:, 0:2],
        coeff=np.transpose(pcaModel.components_[0:2, :]),
        pcaModelFit=pcaModelFit)

    # t-SNE embedding of the encodings; perplexity is the rounded square root
    # of the number of held-out rows. The iteration count is left at the
    # library default of 1000 because the keyword for it was renamed between
    # scikit-learn releases (n_iter -> max_iter). random_state pins the
    # embedding's random number generation.
    tsne_results = TSNE(
        n_components=2,
        perplexity=np.round(encoded.shape[0] ** 0.5, 0),
        learning_rate='auto',
        init='pca',
        random_state=rand_state,
        n_jobs=16).fit_transform(encoded)
    plt.figure()
    sns.scatterplot(
        x=tsne_results[:, 0],
        y=tsne_results[:, 1]).set_title(
            f"tsne of embeddings {model.__name__}")
<class '__main__.shallow'> Epoch 1/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0249 Epoch 2/10 7030/7030 [==============================] - 11s 2ms/step - loss: 0.0095 Epoch 3/10 7030/7030 [==============================] - 11s 2ms/step - loss: 0.0090 Epoch 4/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0087 Epoch 5/10 7030/7030 [==============================] - 9s 1ms/step - loss: 0.0084 Epoch 6/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0084 Epoch 7/10 7030/7030 [==============================] - 11s 2ms/step - loss: 0.0082 Epoch 8/10 7030/7030 [==============================] - 9s 1ms/step - loss: 0.0082 Epoch 9/10 7030/7030 [==============================] - 9s 1ms/step - loss: 0.0082 Epoch 10/10 7030/7030 [==============================] - 9s 1ms/step - loss: 0.0082
No handles with labels found to put in legend. c:\Users\yosty\Envs\python\lib\site-packages\sklearn\manifold\_t_sne.py:982: FutureWarning: The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence. warnings.warn(
<class '__main__.deep'> Epoch 1/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0129 Epoch 2/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0085 Epoch 3/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0082 Epoch 4/10 7030/7030 [==============================] - 11s 2ms/step - loss: 0.0081 Epoch 5/10 7030/7030 [==============================] - 11s 2ms/step - loss: 0.0079 Epoch 6/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0078 Epoch 7/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0078 Epoch 8/10 7030/7030 [==============================] - 11s 1ms/step - loss: 0.0074 Epoch 9/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0071 Epoch 10/10 7030/7030 [==============================] - 10s 1ms/step - loss: 0.0069
No handles with labels found to put in legend. c:\Users\yosty\Envs\python\lib\site-packages\sklearn\manifold\_t_sne.py:982: FutureWarning: The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence. warnings.warn(
<class '__main__.deeep'> Epoch 1/10 7030/7030 [==============================] - 21s 3ms/step - loss: 0.0121 Epoch 2/10 7030/7030 [==============================] - 20s 3ms/step - loss: 0.0104 Epoch 3/10 7030/7030 [==============================] - 22s 3ms/step - loss: 0.0104 Epoch 4/10 7030/7030 [==============================] - 24s 3ms/step - loss: 0.0104 Epoch 5/10 7030/7030 [==============================] - 22s 3ms/step - loss: 0.0104 Epoch 6/10 7030/7030 [==============================] - 22s 3ms/step - loss: 0.0104 Epoch 7/10 7030/7030 [==============================] - 23s 3ms/step - loss: 0.0104 Epoch 8/10 7030/7030 [==============================] - 22s 3ms/step - loss: 0.0104 Epoch 9/10 7030/7030 [==============================] - 19s 3ms/step - loss: 0.0104 Epoch 10/10 7030/7030 [==============================] - 21s 3ms/step - loss: 0.0104
No handles with labels found to put in legend. c:\Users\yosty\Envs\python\lib\site-packages\sklearn\manifold\_t_sne.py:982: FutureWarning: The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence. warnings.warn(
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>