# Database
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import geopandas as gpd
import geodatasets

# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Clustering
from sklearn.cluster import KMeans

# For reproducibility
RANDOM_STATE = 42

data_path = "data/climate_change_indicators.csv"

db = pd.read_csv(data_path)

db.head(2)

db.columns

Index(['ObjectId', 'Country', 'ISO2', 'ISO3', 'Indicator', 'Unit', 'Source',
       'CTS_Code', 'CTS_Name', 'CTS_Full_Descriptor', 'F1961', 'F1962',
       'F1963', 'F1964', 'F1965', 'F1966', 'F1967', 'F1968', 'F1969', 'F1970',
       'F1971', 'F1972', 'F1973', 'F1974', 'F1975', 'F1976', 'F1977', 'F1978',
       'F1979', 'F1980', 'F1981', 'F1982', 'F1983', 'F1984', 'F1985', 'F1986',
       'F1987', 'F1988', 'F1989', 'F1990', 'F1991', 'F1992', 'F1993', 'F1994',
       'F1995', 'F1996', 'F1997', 'F1998', 'F1999', 'F2000', 'F2001', 'F2002',
       'F2003', 'F2004', 'F2005', 'F2006', 'F2007', 'F2008', 'F2009', 'F2010',
       'F2011', 'F2012', 'F2013', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018',
       'F2019', 'F2020', 'F2021', 'F2022'],
      dtype='object')

def get_temps_country(country, db = db):
    return db.loc[db['Country'] == country, 'F1961':'F2022'].to_numpy().reshape(-1,1)

country = 'United Kingdom'

plt.plot(range(1961, 2023), get_temps_country(country))
plt.show()

for country in db['Country']:
    temps = get_temps_country(country)
    plt.scatter(range(1961, 2023), temps, alpha = 0.1, c = temps, cmap = 'rainbow')
plt.plot(range(1961, 2023), np.mean(db.loc[:,'F1961':'F2022'], axis = 0), c ='black')
plt.xlabel("Year")
plt.ylabel("Relative temperature (°C)")
plt.title("Average temperature in world countries per year, relative to the mean for 1951-1980")
plt.show()

l = []
for i in range(1961, 2019):
    temps_5_years = db.loc[:,'F' + str(i) : 'F' + str(i+4)].to_numpy()
    l.append(np.nanvar(temps_5_years.flatten()))
plt.plot(range(1963, 2021), l)
plt.xlabel("Year")
plt.ylabel("Variance in temperature")
plt.title("Variance in temperature in 5-year intervals over all couries")
plt.show()

X_train = np.tile(np.arange(1961, 2023), db.shape[0])
y_train = db.loc[:,'F1961':'F2022'].to_numpy().flatten()

# Getting the indices of NaN values
nanIndex = np.argwhere(np.isnan(y_train))
#Removing the corresponding entries
X_train = np.delete(X_train, nanIndex)
y_train = np.delete(y_train, nanIndex)

poly = PolynomialFeatures(degree = 2)
X_train_poly = poly.fit_transform(X_train.reshape(-1,1))

poly_reg_model = LinearRegression()
poly_reg_model.fit(X_train_poly, y_train)

LinearRegression()

LinearRegression()

last_year_predict = 2050

X_predict = np.arange(1961, last_year_predict + 1)
X_predict_poly = poly.fit_transform(X_predict.reshape(-1,1))
y_predict = poly_reg_model.predict(X_predict_poly)

plt.plot(X_predict, y_predict, c ='black')

plt.axhline(2.5, c = 'red', label = "2.5°C")
plt.axhline(2, c = 'orange', label = "2°C")
plt.axhline(1.5, c = 'green', label = "1.5°C")

for country in db['Country']:
    temps = get_temps_country(country)
    plt.scatter(range(1961, 2023), temps, alpha = 0.1, c = temps, cmap = 'rainbow')
plt.xlabel("Year")
plt.ylabel("Relative temperature (°C)")
plt.title("Average temperature in world countries per year, relative to the mean for 1951-1980")
plt.legend(loc = 'upper left')
plt.show()

d = {'Country': db['Country'],
     'ISO3': db['ISO3'],
     "First decade": db.loc[:, 'F1980':'F1989'].mean(axis = 1),
     "Last decade": db.loc[:, 'F2013':'F2022'].mean(axis = 1)}
db_tempdiff = pd.DataFrame(d)
db_tempdiff.head(2)

db_tempdiff = db_tempdiff.dropna()

db_tempdiff.plot.scatter("First decade", "Last decade")
plt.show()

do_elbow_method = False # Change to True to check results

if do_elbow_method:
    list_loss = []
    for k in range(1,6):
        kmeans = KMeans(n_clusters = k, n_init = 10, random_state = RANDOM_STATE)
        kmeans.fit(db_tempdiff2.loc[:, ["First decade", "Last decade"]])
        list_loss.append(kmeans.inertia_)
    
    plt.plot(range(1,6), list_loss, marker = 'o', linestyle = '-')
    plt.xlabel('Number of clusters')
    plt.ylabel('Loss')
    plt.title('Elbow method')
    plt.grid(True)
    plt.show()

kmeans = KMeans(n_clusters = 4, n_init = 10, random_state = RANDOM_STATE)
db_tempdiff['Cluster'] = kmeans.fit_predict(db_tempdiff.loc[:, ["First decade", "Last decade"]])
db_tempdiff.head(2)

sns.scatterplot(db_tempdiff, x = "First decade", y = "Last decade", hue = 'Cluster', palette = ['red','blue','orange','green'], alpha = 0.3, s = 20)
sns.scatterplot(x = kmeans.cluster_centers_[:, 0], y = kmeans.cluster_centers_[:, 1], c = ['red','blue','orange','green'], s = 50, marker = 'x', linewidth=3)
plt.legend().remove()
plt.show()

worldmap_path = "../tools/ne_110m_admin_0_countries.zip"
world = gpd.read_file(worldmap_path)
# Other (deprecated) method:
#world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
world.head(2)

# Replace 'ADM0_A3' with 'iso_a3' if using the deprecated map: gpd.read_file(gpd.datasets...)
world_clusters = pd.merge(world.rename(columns = {'ADM0_A3' : 'ISO3'}), db_tempdiff.loc[:, ['ISO3', 'Cluster']], how = 'left')
# If we want int for clusters
world_clusters['Cluster'] = world_clusters['Cluster'].fillna(4).astype('int')
colors = ['red','blue','orange','green', 'gray']
names = ['High','Low','Medium early','Medium late', 'Missing data']
world_clusters['Cluster_name'] = world_clusters['Cluster'].map(lambda x : names[x], na_action = 'ignore')
world_clusters['Color'] = world_clusters['Cluster'].map(lambda x : colors[x], na_action = 'ignore')
world_clusters.tail(2)

fig, ax = plt.subplots(figsize = (12,4))

world_clusters.plot(color = world_clusters['Color'], ax = ax)

colors = ['red','orange','green','blue', 'gray']
names = ['High','Medium early','Medium late','Low', 'Missing data']

custom_points = [Line2D([0], [0], marker="o", linestyle="none", markersize=6, color=color) for color in colors]
leg_points = ax.legend(custom_points, names, loc = 'lower left')
ax.add_artist(leg_points)
ax.set_axis_off()
plt.show()

start = 1975  # min 1965
end = 2018    # max 2018
d = {'ISO3': db['ISO3'],
     'Evolution': np.mean(db.loc[:, 'F'+str(end-4):'F'+str(end+4)], axis = 1) - np.mean(db.loc[:, 'F'+str(start-4):'F'+str(start + 4)], axis = 1)}
# As before, replace 'ADM0_A3' with 'iso_a3' if using the deprecated map
world_evol = pd.merge(world.rename(columns = {'ADM0_A3' : 'ISO3'}), pd.DataFrame(d), how = 'left')

fig, ax = plt.subplots(figsize = (12,4))
world_evol.plot('Evolution', cmap = 'coolwarm', legend = True, ax = ax, missing_kwds={'color': 'grey'})
ax.set_axis_off()
ax.set_title(f"Temperature gain since {start} (averaged over a decade)")
plt.show()

	ObjectId	Country	ISO2	ISO3	Indicator	Unit	Source	CTS_Code	CTS_Name	CTS_Full_Descriptor	...	F2013	F2014	F2015	F2016	F2017	F2018	F2019	F2020	F2021	F2022
0	1	Afghanistan, Islamic Rep. of	AF	AFG	Temperature change with respect to a baseline ...	Degree Celsius	Food and Agriculture Organization of the Unite...	ECCS	Surface Temperature Change	Environment, Climate Change, Climate Indicator...	...	1.281	0.456	1.093	1.555	1.540	1.544	0.910	0.498	1.327	2.012
1	2	Albania	AL	ALB	Temperature change with respect to a baseline ...	Degree Celsius	Food and Agriculture Organization of the Unite...	ECCS	Surface Temperature Change	Environment, Climate Change, Climate Indicator...	...	1.333	1.198	1.569	1.464	1.121	2.028	1.675	1.498	1.536	1.518

	Country	ISO3	First decade	Last decade
0	Afghanistan, Islamic Rep. of	AFG	0.2293	1.2216
1	Albania	ALB	-0.0326	1.4940

	Country	ISO3	First decade	Last decade	Cluster
0	Afghanistan, Islamic Rep. of	AFG	0.2293	1.2216	2
1	Albania	ALB	-0.0326	1.4940	3

	featurecla	scalerank	LABELRANK	SOVEREIGNT	SOV_A3	ADM0_DIF	LEVEL	TYPE	TLC	ADMIN	...	FCLASS_TR	FCLASS_ID	FCLASS_PL	FCLASS_GR	FCLASS_IT	FCLASS_NL	FCLASS_SE	FCLASS_BD	FCLASS_UA	geometry
0	Admin-0 country	1	6	Fiji	FJI	0	2	Sovereign country	1	Fiji	...	None	None	None	None	None	None	None	None	None	MULTIPOLYGON (((180.00000 -16.06713, 180.00000...
1	Admin-0 country	1	3	United Republic of Tanzania	TZA	0	2	Sovereign country	1	United Republic of Tanzania	...	None	None	None	None	None	None	None	None	None	POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...

	featurecla	scalerank	LABELRANK	SOVEREIGNT	SOV_A3	ADM0_DIF	LEVEL	TYPE	TLC	ADMIN	...	FCLASS_GR	FCLASS_IT	FCLASS_NL	FCLASS_SE	FCLASS_BD	FCLASS_UA	geometry	Cluster	Cluster_name	Color
175	Admin-0 country	1	5	Trinidad and Tobago	TTO	0	2	Sovereign country	1	Trinidad and Tobago	...	None	None	None	None	None	None	POLYGON ((-61.68000 10.76000, -61.10500 10.890...	1	Low	blue
176	Admin-0 country	1	3	South Sudan	SDS	0	2	Sovereign country	1	South Sudan	...	None	None	None	None	None	None	POLYGON ((30.83385 3.50917, 29.95350 4.17370, ...	4	Missing data	gray

Climate change: study of temperature evolutions¶

Getting familiar with the dataset¶

First plot¶

Machine learning¶

Polynomial regression¶

Clustering: evolution in 80s and last decade¶

Temperature evolution over the last few decades¶