import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')


              
                # Load haberman.csv (resolved relative to the working directory).
haberman = pd.read_csv(filepath_or_buffer="haberman.csv")

# Report the frame dimensions as a (rows, columns) tuple.
n_rows, n_cols = haberman.shape
print((n_rows, n_cols))

(306, 4)


              
                # Show the column labels of the loaded frame.
column_labels = haberman.columns
print(column_labels)

Index(['age', 'year', 'nodes', 'status'], dtype='object')


              
                # Number of rows for each distinct value in the `status` column.
# The trailing bare name keeps this the cell's last expression, so a
# notebook still displays the resulting Series.
status_counts = haberman["status"].value_counts()
status_counts

1    225
2     81
Name: status, dtype: int64


              
                # Distribution plot of every feature, one figure each, coloured by `status`.
#
# Changes relative to the three copy-pasted cells this replaces:
#   * `height=` is used instead of `size=`; seaborn renamed the FacetGrid
#     keyword in 0.9 and the old spelling is deprecated.
#   * The y axis is labelled "Density": distplot normalises the histogram
#     whenever a KDE is drawn (its default), so the axis is not a percentage.
#   * One loop produces the three figures in the original order.
#
# NOTE(review): sns.distplot is itself deprecated in recent seaborn releases
# in favour of histplot/displot; it is kept so the rendered figures do not
# change shape.
for feature in ("age", "year", "nodes"):
    grid = sns.FacetGrid(haberman, hue="status", height=5)
    grid.map(sns.distplot, feature)
    grid.add_legend()
    plt.ylabel("Density")
    plt.title("Histogram of {}".format(feature))
    plt.show()


              
                # Binned PDF and CDF of each feature, drawn separately for status 1 and 2.
def _binned_pdf_cdf(values, bins=10):
    """Return ``(bin_edges, pdf, cdf)`` for a one-dimensional sample.

    ``values`` is histogrammed into ``bins`` equal-width bins.  ``pdf`` holds
    the per-bin heights rescaled so that they sum to 1, and ``cdf`` is the
    running total of ``pdf``.  ``bin_edges`` has one more entry than ``pdf``.
    """
    counts, bin_edges = np.histogram(values, bins=bins, density=True)
    # Rescale the density heights so the bins sum to 1 (fraction per bin).
    pdf = counts / counts.sum()
    return bin_edges, pdf, np.cumsum(pdf)


# Row subsets for the two `status` values; computed once for all figures.
survived = haberman.loc[haberman["status"] == 1]
not_survived = haberman.loc[haberman["status"] == 2]

# Legend entries, listed in the order the four curves are drawn below.
label = ["pdf of class 1", "cdf of class 1", "pdf of class 2", "cdf of class 2"]

for feature in ("nodes", "year", "age"):
    plt.title("pdf VS cdf for {}".format(feature))
    plt.xlabel(feature)
    plt.ylabel('% of patient ')
    for subset in (survived, not_survived):
        bin_edges, pdf, cdf = _binned_pdf_cdf(subset[feature])
        # Each value is plotted against the right-hand edge of its bin.
        plt.plot(bin_edges[1:], pdf)
        plt.plot(bin_edges[1:], cdf)
    # Attach the legend once, after all four curves exist, so every label
    # has a matching line.
    plt.legend(label)
    plt.show()


              
                # Box plot of each feature grouped by `status`, one figure per feature.
for feature in ("nodes", "age", "year"):
    sns.boxplot(data=haberman, x="status", y=feature)
    plt.title("Boxplot of " + feature)
    plt.show()


              
                # Violin plot of `age` and of `nodes`, each grouped by `status`.
#
# The figure titles previously read "Voilin plots of ..."; the spelling is
# corrected here, and the two identical cells are folded into one loop that
# draws the figures in the original order.
for feature in ("age", "nodes"):
    sns.violinplot(x="status", y=feature, data=haberman)
    plt.title("Violin plots of {}".format(feature))
    plt.show()


              
                # Plain scatter plot of `age` (x axis) against `year` (y axis).
# NOTE(review): the title names only "age" although `year` is on the y axis.
scatter_axes = haberman.plot.scatter(x="age", y="year")
plt.title("scatter plots of age")
plt.show()


              
                # Scatter plot for each pair of features, points coloured by `status`.
#
# Changes relative to the three copy-pasted cells this replaces:
#   * `height=` is used instead of `size=`; seaborn renamed the FacetGrid
#     keyword in 0.9 and the old spelling is deprecated.
#   * The style is set once; repeating the same `set_style` call before
#     every figure had no additional effect.
#   * One loop produces the three figures in the original order.
sns.set_style("whitegrid")
for x_feature, y_feature in (("age", "year"), ("age", "nodes"), ("year", "nodes")):
    grid = sns.FacetGrid(haberman, hue="status", height=4)
    grid.map(plt.scatter, x_feature, y_feature)
    grid.add_legend()
    plt.title("scatter plots of {},{}".format(x_feature, y_feature))
    plt.show()


              
                # Pair plot over the three features, coloured by `status`.
plt.close()  # close the current figure before the pair plot is drawn
sns.set_style("whitegrid")
pair_features = ["age", "year", "nodes"]
sns.pairplot(data=haberman, vars=pair_features, hue="status")
plt.show()

¿Por qué el análisis exploratorio de datos?

Definición :

¿Qué es el análisis exploratorio de datos (EDA)?

Análisis exploratorio de datos explicado utilizando un conjunto de datos de muestra:

Análisis univariante: PDF

Análisis univariante: CDF

Análisis univariante: diagrama de caja

Análisis univariante: gráficos de violín