import numpy as np
from distfit import distfit
from matplotlib import pyplot as plt


            
              # Draw 10,000 samples from a normal distribution with mean 163 and
              # standard deviation 10 (presumably human heights, going by the
              # article headings; units are not stated here).
              X = np.random.normal(
                  loc=163,
                  scale=10,
                  size=10000,
              )


            
              # Fit the 'popular' set of candidate distributions to X with
              # n_boots=100, then draw the summary plot limited to n_top=10.
              dfit = distfit(
                  n_boots=100,
                  distr='popular',
              )
              results = dfit.fit_transform(X)
              dfit.plot_summary(n_top=10)
              plt.show()

[distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> [norm      ] [1.12 sec] [RSS: 5.7328e-05] [loc=163.028 scale=10.011]
[distfit] >INFO> [expon     ] [0.90 sec] [RSS: 0.0173364] [loc=123.632 scale=39.396]
[distfit] >INFO> [pareto    ] [56.1 sec] [RSS: 0.0192586] [loc=-0.314 scale=123.945]
[distfit] >INFO> [dweibull  ] [23.0 sec] [RSS: 0.000238248] [loc=163.293 scale=8.542]
[distfit] >INFO> [t         ] [51.8 sec] [RSS: 5.7409e-05] [loc=163.016 scale=9.998]
[distfit] >INFO> [genextreme] [173. sec] [RSS: 0.0470114] [loc=203.560 scale=6.765]   
[distfit] >INFO> [gamma     ] [33.9 sec] [RSS: 6.05656e-05] [loc=-674.146 scale=0.120]
[distfit] >INFO> [lognorm   ] [84.5 sec] [RSS: 0.000570502] [loc=106.234 scale=55.695]
[distfit] >INFO> [beta      ] [63.7 sec] [RSS: 5.58594e-05] [loc=-120.706 scale=492.311]
[distfit] >INFO> [uniform   ] [0.90 sec] [RSS: 0.012937] [loc=123.632 scale=81.663]
[distfit] >INFO> [loggamma  ] [74.7 sec] [RSS: 5.53155e-05] [loc=-1658.818 scale=274.997]
[distfit] >INFO> Compute confidence intervals [parametric]
[distfit] >INFO> Ploting Summary.
[distfit] >INFO> Bootstrap results are included..


            
              # One figure with two panels: the PDF chart (n_top=1) goes on the
              # first axis and the CDF chart (n_top=10) on the second.
              fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
              dfit.plot(ax=ax[0], chart='PDF', n_top=1)
              dfit.plot(ax=ax[1], chart='CDF', n_top=10)
              plt.show()

[distfit] >INFO> Create PDF plot for the parametric method.
[distfit] >INFO> Estimated distribution: Norm(loc:163.027510, scale:10.011438)
[distfit] >INFO> Create CDF plot for the parametric method.
[distfit] >INFO> Ploting CDF


            
              # Refit X using only the loggamma distribution, with alpha=0.01 and
              # bound='both', and print the resulting model dictionary.
              dfit = distfit(
                  distr='loggamma',
                  alpha=0.01,
                  bound='both',
              )
              results = dfit.fit_transform(X)
              print(dfit.model)
              # Optional step the author left disabled:
              # dfit.save('./human_height_model.pkl')

[distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> [loggamma] [0.53 sec] [RSS: 5.53155e-05] [loc=-1658.818 scale=274.997]
[distfit] >INFO> Compute confidence intervals [parametric]

{'name': 'loggamma', 'score': 5.531547878947298e-05, 'loc': -1658.8175767223656,
            'scale': 274.9974343225563, 'arg': (754.1649974963748,),
            'params': (754.1649974963748, -1658.8175767223656, 274.9974343225563),
            'model': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017CF3114048>,
            'bootstrap_score': 0, 'bootstrap_pass': None, 'color': '#e41a1c',
            'CII_min_alpha': 139.4535555539385,
            'CII_max_alpha': 186.0622211891416}


            
              # Score three unseen values against the fitted model (alpha=0.01,
              # multiple-test correction 'fdr_bh') and show the result table.
              y = [130, 160, 200]
              results = dfit.predict(y, alpha=0.01, multtest='fdr_bh', todf=True)
              # A bare `results['df']` in the middle of a cell displays nothing,
              # so print the table explicitly.
              print(results['df'])

              # plt.subplots() creates its own figure; the former plt.figure() call
              # only produced an extra, empty "Figure ... with 0 Axes".
              fig, ax = plt.subplots(1, 2, figsize=(20, 8))
              dfit.plot(chart='PDF', ax=ax[0])
              dfit.plot(chart='CDF', ax=ax[1])
              plt.show()

[distfit] >INFO> Alpha is set to [0.01]
[distfit] >INFO> Compute confidence intervals [parametric]
[distfit] >INFO> Compute significance for 3 samples.
[distfit] >INFO> Multiple test correction method applied: [fdr_bh].
[distfit] >INFO> Create PDF plot for the parametric method.
[distfit] >INFO> Mark 2 significant regions
[distfit] >INFO> Estimated distribution: Loggamma(loc:-1658.817577, scale:274.997434)
[distfit] >INFO> Create CDF plot for the parametric method.
[distfit] >INFO> Ploting CDF
[distfit] >INFO> Mark 2 significant regions

<Figure size 640x480 with 0 Axes>


            
              # Start from a fresh distfit instance and use it to fetch the
              # 'gas_spot_price' example data; preview the first five rows.
              dfit = distfit()
              df = dfit.import_example(
                  data='gas_spot_price',
              )
              df.head(n=5)

[distfit] >INFO> Downloading and processing [gas_spot_price] from github source.


            
              # Line plot of the example data frame with labelled axes and a grid.
              dfit.lineplot(
                  df,
                  xlabel='Years',
                  ylabel='Natural gas spot price',
                  grid=True,
              )
              plt.show()

[distfit] >INFO> Dataframe detected. Labels are derived from the index and the data is flattened.
No artists with labels found to put in legend.  Note that artists whose label start with an
underscore are ignored when legend() is called with no argument.


            
              # Fit the 'popular' candidate distributions to the price column with
              # n_boots=100. The author also left a disabled alternative:
              # distfit(distr='full', n_boots=100).
              dfit = distfit(
                  n_boots=100,
                  distr='popular',
              )
              results = dfit.fit_transform(df['price'].values)

              # Two panels, PDF first and CDF second, each limited to n_top=10.
              fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))
              for panel, chart in zip(ax, ('PDF', 'CDF')):
                  dfit.plot(chart=chart, n_top=10, ax=panel)
              plt.show()

[distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> [norm      ] [1.06 sec] [RSS: 0.169681] [loc=4.235 scale=2.195]
[distfit] >INFO> [expon     ] [1.00 sec] [RSS: 0.257492] [loc=1.050 scale=3.185] 
[distfit] >INFO> [pareto    ] [27.6 sec] [RSS: 0.739523] [loc=0.001 scale=1.049]  
[distfit] >INFO> [dweibull  ] [19.3 sec] [RSS: 0.124167] [loc=3.695 scale=1.675]    
[distfit] >INFO> [t         ] [37.0 sec] [RSS: 0.131213] [loc=3.779 scale=1.566]
[distfit] >INFO> [genextreme] [65.2 sec] [RSS: 0.0344589] [loc=3.091 scale=1.243]     
[distfit] >INFO> [gamma     ] [11.2 sec] [RSS: 0.0545076] [loc=1.047 scale=1.288]
[distfit] >INFO> [lognorm   ] [55.4 sec] [RSS: 0.0316565] [loc=0.994 scale=2.636]  
[distfit] >INFO> [beta      ] [76.9 sec] [RSS: 0.0545361] [loc=1.047 scale=25085.315]
[distfit] >INFO> [uniform   ] [1.05 sec] [RSS: 0.525032] [loc=1.050 scale=22.810]  
[distfit] >INFO> [loggamma  ] [49.2 sec] [RSS: 0.172759] [loc=-637.212 scale=87.927]
[distfit] >INFO> Compute confidence intervals [parametric]
[distfit] >INFO> Create PDF plot for the parametric method.
[distfit] >INFO> Estimated distribution: Lognorm(loc:0.994065, scale:2.636157)
[distfit] >INFO> Create CDF plot for the parametric method.
[distfit] >INFO> Ploting CDF


            
              # Summary plot on the first axis, QQ-plot of the price values
              # (n_top=10) on the second.
              fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))
              dfit.plot_summary(ax=ax[0])
              dfit.qqplot(
                  df['price'].values,
                  ax=ax[1],
                  n_top=10,
              )
              plt.show()

[distfit] >INFO> Ploting Summary.
[distfit] >INFO> Bootstrap results are included..


            
              # Score every observed price with alpha=0.05 and multtest=None, then
              # draw the line plot of the price series labelled by the index.
              dfit.predict(
                  df['price'].values,
                  multtest=None,
                  alpha=0.05,
              )
              dfit.lineplot(df['price'], labels=df.index)

[distfit] >INFO> Alpha is set to [0.05]
[distfit] >INFO> Compute confidence intervals [parametric]
[distfit] >INFO> Compute significance for 6555 samples.

(<Figure size 2500x1200 with 1 Axes>,
 <AxesSubplot:title={'center':'\nlognorm(s=0.64734, loc=0.994065, scale=2.63616)'},
 xlabel='x-axes', ylabel='y-axes'>)

	price
date
2023-02-07	2.35
2023-02-06	2.17
2023-02-03	2.40
2023-02-02	2.67
2023-02-01	2.65

Detección de valores atípicos mediante el ajuste de distribución en conjuntos de datos univariados

¿Anomalía o novedad?

Los valores atípicos pueden clasificarse en tres categorías.

Detección de novedad para variables aleatorias continuas.

Paso 1: Determine las funciones de densidad de probabilidad (PDF) que mejor se ajusten a la altura humana.

Paso 2: Inspección visual de las funciones de densidad de probabilidad (PDF) que mejor se ajustan.

Paso 3: Decida teniendo en cuenta también las propiedades de la PDF.

Paso 4: Predicciones para muestras nuevas no vistas.

Detección de anomalías en datos del mundo real.

Inspección visual del conjunto de datos.

Detección de valores atípicos globales y contextuales.

Comentarios.