
    !gm'                     	   d Z ddlmZ ddlZd Zd ZddZ G d d      Ze	dk(  rddl
mZ d	Zej                  j                  e      Zd
gZdev r ee        e eed              e eed              ej&                  ej)                         ej+                               Z eee      Z ej0                  ee        ej0                   eee      edz           ej2                   ej4                  e       ej4                   ee            d      Z ej8                  eD  cg c]  } ej;                  |       d    c}       Z ej>                           ej0                  ee        ee      Z  ej>                           ej0                  ee         ej>                           ejB                  edd  ejD                  e        ejD                  e      z          ej&                  ej)                         ej+                         d      Z# ej>                           ejB                  e#dd  ejD                   ee#             ejD                  e#      z          ej4                  e      Z$e$ddedz     Z% ej>                           ejB                  e%dd  ejD                   ee%             ejD                  e%      z          ee      Z& ee&jO                                 ee&jQ                  e&jR                                ee&jU                  g d              ee&jQ                  g d              ej&                  ej)                         ej+                         d      Z ej2                   ej4                  e       ej4                   ee            d      Z ee      Z  ej>                           ej0                  ee         ej2                  e ed      Z+ e+e       Z, ej0                  e,e         ejZ                   ej4                   ee             ej4                  e      dd      Z. e.e       Z/ ej0                  e/e         ed        ed ejD                  e,      j)                                 ed ejD                  e       j)                                yyc c} w )a 
  
from David Huard's scipy sandbox, also attached to a ticket and
in the matplotlib-user mailinglist  (links ???)


Notes
=====

out of bounds interpolation raises exception and would not be completely
defined ::

>>> scoreatpercentile(x, [0,25,50,100])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.
>>> percentileofscore(x, [-50, 50])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.


idea
====

histogram and empirical interpolated distribution
-------------------------------------------------

dual constructor
* empirical cdf : cdf on all observations through linear interpolation
* binned cdf : based on histogram
both should work essentially the same, although pdf of empirical has
many spikes, fluctuates a lot
- alternative: binning based on interpolated cdf : example in script
* ppf: quantileatscore based on interpolated cdf
* rvs : generic from ppf
* stats, expectation ? how does integration wrt cdf work - theory?

Problems
* limits, lower and upper bound of support
  does not work or is undefined with empirical cdf and interpolation
* extending bounds ?
  matlab has pareto tails for empirical distribution, breaks linearity

empirical distribution with higher order interpolation
------------------------------------------------------

* should work easily enough with interpolating splines
* not piecewise linear
* can use pareto (or other) tails
* ppf how do I get the inverse function of a higher order spline?
  Chuck: resample and fit spline to inverse function
  this will have an approximation error in the inverse function
* -> does not work: higher order spline does not preserve monotonicity
  see mailing list for response to my question
* pmf from derivative available in spline

-> forget this and use kernel density estimator instead


bootstrap/empirical distribution:
---------------------------------

discrete distribution on real line given observations
what's defined?
* cdf : step function
* pmf : points with equal weight 1/nobs
* rvs : resampling
* ppf : quantileatscore on sample?
* moments : from data ?
* expectation ? sum_{all observations x} [func(x) * pmf(x)]
* similar for discrete distribution on real line
* References : ?
* what's the point? most of it is trivial, just for the record ?


Created on Monday, May 03, 2010, 11:47:03 AM
Author: josef-pktd, parts based on David Huard
License: BSD

    Nc                     t        j                  |      }t        |       }t        j                  t        j
                  |      t        j
                  |             } ||dz        S )zReturn the score at the given percentile of the data.

    Example:
        >>> data = randn(100)
            >>> scoreatpercentile(data, 50)

        will return the median of sample `data`.
          Y@)nparrayempiricalcdfinterpolateinterp1dsort)data
percentilepercdfinterpolators        c/var/www/dash_apps/app1/venv/lib/python3.12/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyscoreatpercentiler   V   sM     ((:
C
t
C''bggdmDLD!!    c                     t        |       }t        j                  t        j                  |       t        j                  |            } ||      dz  S )aD  Return the percentile-position of score relative to data.

    score: Array of scores at which the percentile is computed.

    Return percentiles (0-100).

    Example
            r = randn(50)
        x = linspace(-2,2,100)
        percentileofscore(r,x)

    Raise an error if the score is outside the range of data.
    r   )r   r   r	   r   r
   )r   scorer   r   s       r   percentileofscorer   d   s@     t
C''rwws|DLt##r   c                 j   t        j                  t        j                  |             dz   }t        |       }|j                         }|dk(  r
|dz
  |z  }|S |dk(  r
||dz   z  }|S |dk(  r
|dz
  |z  }|S |dk(  r|dz
  |dz   z  }|S |d	k(  r|dz
  |d
z   z  }|S |dk(  r|dz
  |dz   z  }|S t	        d      )a  Return the empirical cdf.

    Methods available:
        Hazen:       (i-0.5)/N
            Weibull:     i/(N+1)
        Chegodayev:  (i-.3)/(N+.4)
        Cunnane:     (i-.4)/(N+.2)
        Gringorten:  (i-.44)/(N+.12)
        California:  (i-1)/N

    Where i goes from 1 to N.
          ?hazen      ?weibull
california
chegodayev333333?皙?cunnane皙?
gringorten)\(?Q?[Unknown method. Choose among Weibull, Hazen,Chegodayev, Cunnane, Gringorten and California.)r   argsortlenlower
ValueError)r   methodiNr   s        r   r   r   v   s	    	

2::d#$r)AD	A\\^Fuai J 
9	2h J 
<	tQh J 
<	tadm J 
9	tadm J 
<	uquo
 J  K L 	Lr   c                   2    e Zd ZdZd ZddZd Zd Zd	dZy)
HistDistzDistribution with piecewise linear cdf, pdf is step function

    can be created from empiricial distribution or from a histogram (not done yet)

    work in progress, not finished


    c                 R   t        j                  |      | _        t        j                  | j                  j	                         | j                  j                         g      | _        t        j                  |      }||   | _        t        j                  |      | _	        | j                         }t        j                  |      | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )N)r   
atleast_1dr   r   minmaxbinlimitr%   _datasortedrankingr   r
   _empcdfsortedr   r	   cdfintpppfintp)selfr   sortindr   s       r   __init__zHistDist.__init__   s    MM$'	$))--/499==?!CD**T"=zz'*!WWS\"++D,<,<d>P>PQ"++D,>,>@P@PQr   Nc                    || j                   }| j                  }n+t        j                  t        j                  |            dz   }t	        |      }|j                         }|dk(  r
|dz
  |z  }|S |dk(  r
||dz   z  }|S |dk(  r
|dz
  |z  }|S |dk(  r|dz
  |dz   z  }|S |d	k(  r|dz
  |d
z   z  }|S |dk(  r|dz
  |dz   z  }|S t        d      )aA  Return the empirical cdf.

        Methods available:
            Hazen:       (i-0.5)/N
                Weibull:     i/(N+1)
            Chegodayev:  (i-.3)/(N+.4)
            Cunnane:     (i-.4)/(N+.2)
            Gringorten:  (i-.44)/(N+.12)
            California:  (i-1)/N

        Where i goes from 1 to N.
        r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   )r   r4   r   r%   r&   r'   r(   )r8   r   r)   r*   r+   r   s         r   r   zHistDist.empiricalcdf   s     <99DA

2::d+,r1AIWS5!)C 
 y QrT(C 
 |#R4(C 
 |#R4!B$-C 
 y R4!B$-C 
 |#S51S5/C
 
  O P Pr   c                 $    | j                  |      S z&
        this is score in dh

        )r6   )r8   r   s     r   cdf_empzHistDist.cdf_emp   s    
 ||E""r   c                 $    | j                  |      S r=   )r7   )r8   quantiles     r   ppf_empzHistDist.ppf_emp   s    
 ||H%%r   c                 T   t        | j                        }|dk(  r/| j                  d      | j                  d      z
  }d|z  |dz  z  }n-|dk(  r(dt        j                  | j                        z  |dz  z  }t        j
                  | j                        z  | _        | j                  S )zFind the optimal number of bins and update the bin countaccordingly.
        Available methods : Freedman
                            Scott
        Freedman      ?      ?   gUUUUUUտScottgQ@)r&   r   rA   r   stdptpr2   nbin)r8   r)   nobsIQRwidths        r   optimize_binningzHistDist.optimize_binning   s     499~:,,t$t||D'99CsFD5M)EW_266$)),,te}<EVVDMM*50	yyr   )NHazen)rC   )	__name__
__module____qualname____doc__r:   r   r>   rA   rN    r   r   r-   r-      s"    
R&R#&r   r-   __main__d   rF      r   2   )k   )rE   r   rD   )g      g      пr   rE   r   i     gQ?)rY   sznegative densityz(np.diff(ppfs)).min()z(np.diff(cdf_ongrid)).min())rO   )0rS   scipy.interpolater   numpyr   r   r   r   r-   rP   matplotlib.pyplotpyplotpltrK   randomrandnxexamplesprintlinspacer0   r1   xsuppposplotInterpolatedUnivariateSpliner
   empr   derivativespdfempfigure
cdf_ongridstepdiffxsupp2xsoxshistdrN   r>   r2   rA   r7   ppfsUnivariateSplineppfempppfe)xis   0r   <module>r}      s  Qd ( "$$!H` `H z#D
		AsHH}Q3'(2&'AEEGQUUW-5) 	"1c*CE2 5K44WRWWQZUV@X[\]EBb3??2.q1BC

vZ



# 	

sGBGGJ/>? QUUWaeegr2

WRWWS[1'"''&/AB bggaj47^

CRR)'"''"+56 QKE	%
 
 
"#	%--
'(	%--)
*+	%--3
45 BKK#.E000GBGGLQRO<TWXYC UJCJJLCHHUJ6k66z%!LG:DCHHT: (;''Q(@aSWXF*DCHHT:	
	
!GBGGDM#6#6#89	
''"''**=)B)B)DES * Cs   +R;