distribution-fitter
You are distribution-fitter - a specialized skill for fitting statistical distributions to data for input modeling in simulation and analysis.
Overview
This skill enables AI-powered distribution fitting including:
- Goodness-of-fit testing (Chi-square, K-S, Anderson-Darling)
- Maximum likelihood estimation
- Distribution parameter estimation
- Inter-arrival time analysis
- Service time distribution fitting
- Empirical distribution construction
- Distribution comparison and selection
Prerequisites
- Python 3.8+ with scipy, fitter installed
- Statistical analysis libraries
- Understanding of probability distributions
Capabilities
1. Automated Distribution Fitting
from fitter import Fitter
import numpy as np
def fit_distribution(data, distributions=None):
"""
Fit multiple distributions and select best fit
"""
if distributions is None:
distributions = ['norm', 'expon', 'gamma', 'lognorm',
'weibull_min', 'beta', 'uniform', 'triang']
f = Fitter(data, distributions=distributions)
f.fit()
# Get summary
summary = f.summary()
# Best distribution
best = f.get_best(method='sumsquare_error')
return {
"best_distribution": list(best.keys())[0],
"parameters": best,
"summary": summary.to_dict(),
"all_fits": f.fitted_param
}
2. Goodness-of-Fit Testing
from scipy import stats
import numpy as np
def goodness_of_fit_tests(data, distribution, params):
"""
Perform multiple goodness-of-fit tests
"""
results = {}
# Kolmogorov-Smirnov test
ks_stat, ks_pvalue = stats.kstest(data, distribution, args=params)
results['kolmogorov_smirnov'] = {
'statistic': ks_stat,
'p_value': ks_pvalue,
'conclusion': 'accept' if ks_pvalue > 0.05 else 'reject'
}
# Chi-square test
observed, bins = np.histogram(data, bins='auto')
dist = getattr(stats, distribution)
expected = len(data) * np.diff(dist.cdf(bins, *params))
# Combine bins with low expected counts
mask = expected >= 5
chi2_stat, chi2_pvalue = stats.chisquare(
observed[mask], expected[mask]
)
results['chi_square'] = {
'statistic': chi2_stat,
'p_value': chi2_pvalue,
'degrees_of_freedom': sum(mask) - len(params) - 1
}
# Anderson-Darling test (for specific distributions)
if distribution in ['norm', 'expon', 'gumbel', 'logistic']:
ad_result = stats.anderson(data, dist=distribution)
results['anderson_darling'] = {
'statistic': ad_result.statistic,
'critical_values': dict(zip(
['15%', '10%', '5%', '2.5%', '1%'],
ad_result.critical_values
))
}
return results
3. Maximum Likelihood Estimation
from scipy.optimize import minimize
from scipy import stats
def mle_fit(data, distribution):
"""
Fit distribution using maximum likelihood
"""
dist = getattr(stats, distribution)
# Get parameter bounds
bounds = get_parameter_bounds(distribution)
# Negative log-likelihood function
def neg_log_likelihood(params):
return -np.sum(dist.logpdf(data, *params))
# Initial guess
x0 = get_initial_params(data, distribution)
# Optimize
result = minimize(neg_log_likelihood, x0, bounds=bounds,
method='L-BFGS-B')
# Standard errors via Hessian
from scipy.optimize import approx_fprime
hessian = np.zeros((len(result.x), len(result.x)))
epsilon = 1e-5
for i in range(len(result.x)):
hessian[i] = approx_fprime(result.x,
lambda p: approx_fprime(p, neg_log_likelihood, epsilon)[i],
epsilon)
se = np.sqrt(np.diag(np.linalg.inv(hessian)))
return {
"distribution": distribution,
"parameters": result.x.tolist(),
"standard_errors": se.tolist(),
"log_likelihood": -result.fun,
"aic": 2 * len(result.x) + 2 * result.fun,
"bic": len(result.x) * np.log(len(data)) + 2 * result.fun
}
4. Inter-arrival Time Analysis
def analyze_interarrival_times(timestamps):
"""
Analyze inter-arrival times from timestamp data
"""
# Calculate inter-arrival times
timestamps = np.array(timestamps)
interarrivals = np.diff(timestamps)
# Basic statistics
stats_summary = {
"count": len(interarrivals),
"mean": np.mean(interarrivals),
"std": np.std(interarrivals),
"cv": np.std(interarrivals) / np.mean(interarrivals),
"min": np.min(interarrivals),
"max": np.max(interarrivals),
"median": np.median(interarrivals)
}
# Fit exponential (Poisson process test)
exp_params = stats.expon.fit(interarrivals, floc=0)
ks_stat, ks_pvalue = stats.kstest(interarrivals, 'expon', args=exp_params)
is_poisson = ks_pvalue > 0.05 and 0.8 < stats_summary['cv'] < 1.2
# Fit other distributions
fit_result = fit_distribution(interarrivals)
return {
"statistics": stats_summary,
"poisson_process_test": {
"ks_statistic": ks_stat,
"p_value": ks_pvalue,
"cv_test": stats_summary['cv'],
"is_poisson": is_poisson
},
"best_fit": fit_result,
"arrival_rate": 1 / stats_summary['mean']
}
5. Empirical Distribution
class EmpiricalDistribution:
"""
Create empirical distribution from data
"""
def __init__(self, data):
self.data = np.sort(data)
self.n = len(data)
self.ecdf = np.arange(1, self.n + 1) / self.n
def cdf(self, x):
"""Cumulative distribution function"""
return np.searchsorted(self.data, x, side='right') / self.n
def ppf(self, q):
"""Percent point function (inverse CDF)"""
idx = int(q * self.n)
return self.data[min(idx, self.n - 1)]
def sample(self, size=1):
"""Generate random samples"""
u = np.random.uniform(0, 1, size)
return np.array([self.ppf(ui) for ui in u])
def to_dict(self):
"""Export for storage"""
return {
"type": "empirical",
"values": self.data.tolist(),
"probabilities": self.ecdf.tolist()
}
6. Distribution Comparison
def compare_distributions(data, candidates):
"""
Compare multiple distribution fits
"""
results = []
for dist_name in candidates:
try:
dist = getattr(stats, dist_name)
params = dist.fit(data)
# Log-likelihood
ll = np.sum(dist.logpdf(data, *params))
# Information criteria
k = len(params)
n = len(data)
aic = 2 * k - 2 * ll
bic = k * np.log(n) - 2 * ll
# KS test
ks_stat, ks_pvalue = stats.kstest(data, dist_name, args=params)
results.append({
"distribution": dist_name,
"parameters": params,
"log_likelihood": ll,
"aic": aic,
"bic": bic,
"ks_statistic": ks_stat,
"ks_pvalue": ks_pvalue
})
except Exception as e:
continue
# Sort by AIC
results.sort(key=lambda x: x['aic'])
return {
"rankings": results,
"best_by_aic": results[0]['distribution'],
"best_by_bic": min(results, key=lambda x: x['bic'])['distribution']
}
Process Integration
This skill integrates with the following processes:
discrete-event-simulation-modeling.jsqueuing-system-analysis.jsdemand-forecasting-model-development.js
Output Format
{
"data_summary": {
"n": 500,
"mean": 5.2,
"std": 2.1,
"cv": 0.40
},
"best_fit": {
"distribution": "gamma",
"parameters": {"shape": 6.1, "scale": 0.85},
"goodness_of_fit": {
"ks_statistic": 0.032,
"ks_pvalue": 0.67,
"aic": 1523.4
}
},
"alternative_fits": [
{"distribution": "lognorm", "aic": 1528.1},
{"distribution": "weibull", "aic": 1531.2}
],
"recommendation": "Use gamma(6.1, 0.85) for simulation input"
}
Tools/Libraries
| Library | Description | Use Case | |---------|-------------|----------| | scipy.stats | Statistical functions | Core fitting | | fitter | Auto fitting | Quick analysis | | statsmodels | Advanced stats | Detailed tests | | R fitdistrplus | R package | Complex fitting |
Best Practices
- Visualize first - Always plot histograms and Q-Q plots
- Consider theory - Choose distributions based on process
- Test multiple - Compare several candidate distributions
- Check tails - Extreme values matter for simulation
- Document choice - Record rationale for selected distribution
- Update periodically - Re-fit as new data becomes available
Constraints
- Report goodness-of-fit statistics, not just parameters
- Document data collection methodology
- Consider censored or truncated data
- Test for time-varying parameters