import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats


            
              # Load the dataset and normalise the names used by every later cell.
df = pd.read_csv('life_expectancy_vs_gdp.csv')
# Shorten the verbose life-expectancy column name to 'LEB'.
df.rename(columns={'Life expectancy at birth (years)': 'LEB'}, inplace=True)
# Replace every cell equal to 'United States' with the shorter 'USA'.
df.replace('United States', 'USA', inplace=True)
# Countries in order of first appearance; this order drives subplot placement.
countries = df['Country'].unique().tolist()
colors = [
    'tab:blue', 'tab:orange', 'tab:green', 'tab:red',
    'tab:purple', 'tab:brown', 'tab:pink', 'tab:olive',
]
# One fixed colour per country. zip() stops at the shorter sequence, so any
# surplus colours are simply left unused.
custom_palette = dict(zip(countries, colors))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  96 non-null     object 
 1   Year     96 non-null     int64  
 2   LEB      96 non-null     float64
 3   GDP      96 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 3.1+ KB

# Bare expression: in a notebook this renders the DataFrame as the cell output.
df


            
              # Summary statistics of the numeric columns (count, mean, std, min, quartiles, max).
              df.describe()


            
              # Number of rows recorded for each country.
df['Country'].value_counts()

Chile       16
China       16
Germany     16
Mexico      16
USA         16
Zimbabwe    16
Name: Country, dtype: int64


            
              # Scatter plot with regression line of LEB against GDP, one panel per country.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(10, 6))
axs = [ax1, ax2, ax3, ax4, ax5, ax6]
fig.suptitle('Correlation between GDP and Life Expectancy per Country')
# X tick labels show GDP divided by this factor; the axis label below names
# the same unit so the numbers are not misread as trillions.
gdp_unit = 1e10
for i, (ax, country) in enumerate(zip(axs, countries)):
    data = df[df['Country'] == country]
    # Draw straight onto the grid axes. `data` is passed by keyword so the call
    # works on seaborn releases where it is keyword-only, and ci=None is the
    # documented way to skip the confidence band (ci=False still bootstraps).
    sns.regplot(data=data, x='GDP', y='LEB', ci=None, color=colors[i], ax=ax)
    ax.annotate(country, xy=(0.05, 0.9), xycoords='axes fraction',
                bbox=dict(boxstyle="round", fc=colors[i], ec=colors[i], alpha=0.5))
    # Pearson correlation coefficient (r) between GDP and LEB for this country.
    r = stats.pearsonr(data['GDP'], data['LEB'])[0]
    ax.annotate(f'r = {r:.2f}', xy=(0.05, 0.8), xycoords='axes fraction')
    # Rescale the X tick labels and drop the first and last automatic ticks.
    xticks = ax.get_xticks()
    xlabels = [tick / gdp_unit for tick in xticks]
    ax.set_xticks(xticks[1:-1])
    ax.set_xticklabels(xlabels[1:-1])
    # Keep only whole-number Y ticks (duplicates removed, ascending order).
    ax.set_yticks(sorted({round(tick) for tick in ax.get_yticks()}))
    # Axis labels only on the left column and the bottom row of the 2x3 grid.
    in_left_column = i % 3 == 0
    in_bottom_row = i >= 3
    ax.set_ylabel('Life Expectancy at Birth (years)' if in_left_column else '')
    ax.set_xlabel('GDP (Tens of Billions USD)' if in_bottom_row else '')


            
              # Twin-axis line charts of LEB and GDP per year, one panel per country.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(10, 8))
axs = [ax1, ax2, ax3, ax4, ax5, ax6]
fig.suptitle('Evolution of Life Expectancy and GDP per Year in each Country')
# The last two palette entries are not assigned to any country when there are
# six countries, so they are used to tell the two series apart.
leb_color, gdp_color = colors[-2], colors[-1]
for i, (ax, country) in enumerate(zip(axs, countries)):
    data = df[df['Country'] == country]
    # Left axis: life expectancy per year, drawn on the grid axes directly.
    g1 = ax
    g1.annotate(country, xy=(0.05, 0.9), xycoords='axes fraction',
                bbox=dict(boxstyle="round", fc=colors[i], ec=colors[i], alpha=0.5))
    g1.plot('Year', 'LEB', data=data, color=leb_color, marker='o')
    g1.set_xticks(range(2000, 2016, 3))
    # Right axis (shares the X axis): GDP per year.
    g2 = g1.twinx()
    g2.plot('Year', 'GDP', data=data, color=gdp_color, marker='o')
    # Keep only whole-number LEB ticks (duplicates removed, ascending order).
    g1.set_yticks(sorted({round(tick) for tick in g1.get_yticks()}))
    # Express the GDP ticks in trillions of USD and show whole values as ints.
    # is_integer() replaces `tick % int(tick) == 0`, which is a modulo by zero
    # for every value below 1 (NaN plus a RuntimeWarning with NumPy floats,
    # ZeroDivisionError with plain floats).
    gdp_ticks = g2.get_yticks()
    gdp_labels = [tick / 1e12 for tick in gdp_ticks]
    gdp_labels = [int(label) if float(label).is_integer() else label
                  for label in gdp_labels]
    # Drop the first and last automatic ticks.
    g2.set_yticks(gdp_ticks[1:-1])
    g2.set_yticklabels(gdp_labels[1:-1])
    # Colour each Y axis like the series it measures.
    g1.tick_params(axis='y', colors=leb_color)
    g2.spines['left'].set_color(leb_color)
    g2.tick_params(axis='y', colors=gdp_color)
    g2.spines['right'].set_color(gdp_color)
    # Remove the top spine of both overlaid axes.
    g1.spines['top'].set_visible(False)
    g2.spines['top'].set_visible(False)
    # Label only the outer edges of the 3x2 grid.
    in_left_column = i % 2 == 0
    in_bottom_row = i >= 4
    if in_left_column:
        g1.set_ylabel('Life Expectancy at Birth (years)', color=leb_color)
    else:
        g2.set_ylabel('GDP (Trillion USD)', color=gdp_color)
    if in_bottom_row:
        g1.set_xlabel('Year')
    else:
        # An empty list (rather than an empty string) hides the year labels.
        g1.set_xticklabels([])


            
              # Average GDP per country (bars) next to the yearly GDP of each country (lines).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
plt.suptitle('GDP Averages & Evolution per Year')
# Mean GDP per country, largest first; this is also the left-to-right bar order.
gdp_means = df.groupby('Country')['GDP'].mean().sort_values(ascending=False)
sns.barplot(x=gdp_means.index, y=gdp_means.values, palette=custom_palette, ax=ax1)
# Show the Y axis in trillions of USD. ':g' keeps fractional ticks such as 0.5
# or 1.5 distinct; rounding them to integers produced duplicate labels.
ax1.set_ylabel('Average GDP (Trillion USD)')
yticks = ax1.get_yticks()
yticklabels = [f'{tick / 1e12:g}' for tick in yticks]
# Drop the last automatic tick.
ax1.set_yticks(yticks[:-1])
ax1.set_yticklabels(yticklabels[:-1])
# Write each mean just above its bar; bars sit at x = 0, 1, 2, ... in the
# order of gdp_means.
for position, mean_gdp in enumerate(gdp_means):
    ax1.text(x=position, y=mean_gdp + 0.05e12, s=f"${mean_gdp / 1e12:.2f}T",
             horizontalalignment='center')
# One line per country, coloured like its bar. .get() returns None for a
# country without a palette entry, which falls back to the default cycle.
for country in countries:
    data = df[df['Country'] == country]
    ax2.plot(data['Year'], data['GDP'], marker='o', label=country,
             color=custom_palette.get(country))
# Label the Y axis in trillions of USD at whatever ticks fit the data, instead
# of a fixed 0-17.5 trillion range; the axis still starts at zero.
ax2.set_ylabel('GDP (Trillion USD)')
ax2.set_ylim(bottom=0)
ax2.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'{value / 1e12:g}'))
ax2.set_xlabel('Year')
ax2.set_xticks(range(2000, 2016, 3));


            
              # Average life expectancy per country (bars) next to its yearly evolution (lines).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
plt.suptitle('Life Expectancy Averages & Evolution per Year')
# Mean LEB per country, largest first; this is also the left-to-right bar order.
leb_means = df.groupby('Country')['LEB'].mean().sort_values(ascending=False)
sns.barplot(x=leb_means.index, y=leb_means.values, palette=custom_palette, ax=ax1)
ax1.set_ylabel('Average Life Expectancy at Birth (years)')
# Write each mean just above its bar; bars sit at x = 0, 1, 2, ... in the
# order of leb_means.
for position, mean_leb in enumerate(leb_means):
    ax1.text(x=position, y=mean_leb + 0.5, s=f"{mean_leb:.1f}",
             horizontalalignment='center')
# One line per country, coloured like its bar so the two panels can be read
# together. .get() returns None for a country without a palette entry, which
# falls back to the default colour cycle.
for country in countries:
    data = df[df['Country'] == country]
    ax2.plot(data['Year'], data['LEB'], marker='o',
             color=custom_palette.get(country))
ax2.set_ylabel('Life Expectancy at birth (years)')
ax2.set_xlabel('Year');


            
              # Side-by-side box plots of the yearly GDP and LEB distributions.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Evolution of GDP & Life Expectancy Distributions per Year')
gdp_ax, leb_ax = axs
# Shared styling of the median annotations written inside the boxes.
median_text_style = dict(horizontalalignment='center', size='x-small',
                         color='w', weight='semibold')

# Left panel: GDP per year.
g1 = sns.boxplot(data=df, x='Year', y='GDP', ax=gdp_ax)
g1.set_ylabel('GDP (Trillion USD)')
plt.setp(g1.get_xticklabels(), rotation=45)
# Relabel the Y ticks in trillions, dropping the first and last automatic ticks.
inner_yticks = g1.get_yticks()[1:-1]
g1.set_yticks(inner_yticks)
g1.set_yticklabels([tick / 1e12 for tick in inner_yticks])
# Yearly medians in trillions; groupby returns them in ascending year order,
# which is the order of the boxes.
g1_medians = (df.groupby('Year')['GDP'].median() / 1e12).tolist()
for xtick, median in zip(g1.get_xticks(), g1_medians):
    # Place the text 10% above the median, back in raw USD units.
    g1.text(xtick, median * 1.1e12, f'{median:.2f}', **median_text_style)

# Right panel: life expectancy at birth per year.
g2 = sns.boxplot(data=df, x='Year', y='LEB', ax=leb_ax)
g2.set_ylabel('Life Expectancy at Birth (years)')
plt.setp(g2.get_xticklabels(), rotation=45)
g2_medians = df.groupby('Year')['LEB'].median()
for xtick, median in zip(g2.get_xticks(), g2_medians):
    # Place the text 0.3 years above the median.
    g2.text(xtick, median + 0.3, f'{median:.1f}', **median_text_style);


            
              # Relative change of GDP and LEB between each country's first and last year.
def _relative_change(series):
    """Return (last - first) / first for a series already ordered in time."""
    first, last = series.iloc[0], series.iloc[-1]
    return (last - first) / first


# Sorting by year first makes iloc[0] / iloc[-1] the earliest / latest row of
# each country group.
df_growth = (
    df.sort_values(by='Year')
    .groupby('Country')
    .agg(gdp_growth=('GDP', _relative_change),
         leb_growth=('LEB', _relative_change))
)
display(df_growth)


            
              # Bar charts ranking the countries by relative GDP and LEB growth.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('GDP & Life Expectancy Growth Between 2000-2015')
# One entry per panel:
# (axes, df_growth column, Y-axis label, vertical offset of the bar labels).
panels = [
    (ax1, 'gdp_growth', 'Gross Domestic Product (GDP)', 0.05),
    (ax2, 'leb_growth', 'Life Expectancy at Birth', 0.002),
]
for ax, column, ylabel, offset in panels:
    # Countries ranked by growth, largest first; this is the bar order.
    ranked = df_growth[column].sort_values(ascending=False)
    order = ranked.index.tolist()
    sns.barplot(x=df_growth.index, y=df_growth[column], palette=custom_palette,
                order=order, ax=ax)
    # Show the Y ticks as percentages, dropping the last automatic tick.
    ax.set_ylabel(ylabel)
    yticks = ax.get_yticks()
    ylabels = [f'{tick * 100:.0f}%' for tick in yticks]
    ax.set_yticks(yticks[:-1])
    ax.set_yticklabels(ylabels[:-1])
    # Write each growth value just above its bar; bars sit at x = 0, 1, 2, ...
    # in the order given by `order`.
    for position, growth in enumerate(ranked):
        ax.text(x=position, y=growth + offset, s=f"{growth * 100:.1f}%",
                horizontalalignment='center')

	Country	Year	LEB	GDP
0	Chile	2000	76.8	2.255210e+10
1	Chile	2001	76.9	2.377458e+10
2	Chile	2002	77.4	2.482215e+10
3	Chile	2003	77.4	2.503393e+10
4	Chile	2004	77.6	2.663641e+10
...	...	...	...	...
91	Zimbabwe	2011	53.3	3.433326e+09
92	Zimbabwe	2012	55.6	3.647860e+09
93	Zimbabwe	2013	57.5	3.687255e+09
94	Zimbabwe	2014	58.8	3.775846e+09
95	Zimbabwe	2015	59.6	3.886050e+09

	Year	LEB	GDP
count	96.000000	96.000000	9.600000e+01
mean	2007.500000	72.353125	1.093783e+12
std	4.633971	10.931759	1.318816e+12
min	2000.000000	42.000000	2.577216e+09
25%	2003.750000	74.100000	2.480371e+10
50%	2007.500000	76.800000	3.951206e+11
75%	2011.250000	78.625000	2.540505e+12
max	2015.000000	80.900000	3.885245e+12

	gdp_growth	leb_growth
Country
Chile	0.310557	0.037760
China	2.211223	0.070932
Germany	1.038317	0.032010
Mexico	0.432843	0.014946
USA	0.329850	0.027344
Zimbabwe	0.507848	0.333333

Análisis Exploratorio de Datos (Caso 4)

Información del conjunto de datos

Objetivos del proyecto

Análisis

Importar bibliotecas de Python

Se cargan y limpian los datos.

Visualice el marco de datos.

Visualización y análisis de datos.

Gráficos de dispersión individuales.

Gráficos de líneas individuales.

Diagramas de barras y gráficos de líneas agrupados

Comparación de grupos de PIB.

Comparación de grupos de esperanza de vida

Diagramas de caja combinados.

Siempre hay algo más.

Codifiquemos un poco más

Graficarlos juntos para comparar.

Comentarios.