资金管理平台概率性现金流预测模型（机器学习）-Seo优化-塔城地区网站建设公司

特征工程

class CashFlowFeatureEngineer:
    """Build the per-date feature matrix used by the cash-flow models.

    Every row combines three groups of features: calendar attributes of the
    target date, business features fetched through ``_get_*`` helper methods,
    and rolling statistics of the ``cash_flow`` column of the history.

    The set of historical feature keys and their order is fixed, so frames
    built from different date ranges always share the same columns. Callers
    that hand ``frame.values`` to a model rely on that alignment.
    """

    # Look-back window lengths, in calendar days.
    WINDOWS = (7, 14, 30, 60, 90, 180)

    # (feature-name prefix, pandas Series method) computed for each window.
    _WINDOW_STATS = (
        ('mean', 'mean'),
        ('std', 'std'),
        ('min', 'min'),
        ('max', 'max'),
        ('median', 'median'),
        ('skew', 'skew'),
        ('kurt', 'kurtosis'),
    )

    def create_features(self, historical_data: pd.DataFrame,
                        future_dates: pd.DatetimeIndex) -> pd.DataFrame:
        """Create one feature row per date in ``future_dates``.

        Args:
            historical_data: Frame indexed by date with at least a
                ``cash_flow`` column.
            future_dates: Dates to build features for.

        Returns:
            A frame indexed by ``date`` with one column per feature. An empty
            ``future_dates`` yields an empty frame with an index named
            ``date`` instead of raising ``KeyError``.
        """
        rows = []
        for date in future_dates:
            # Calendar features derived from the date alone.
            base_features = {
                'date': date,
                'year': date.year,
                'month': date.month,
                'day': date.day,
                'day_of_week': date.dayofweek,
                'day_of_year': date.dayofyear,
                # Index 1 of isocalendar() is the ISO week number.
                'week_of_year': date.isocalendar()[1],
                'quarter': date.quarter,
                'is_month_start': date.is_month_start,
                'is_month_end': date.is_month_end,
                'is_quarter_start': date.is_quarter_start,
                'is_quarter_end': date.is_quarter_end,
                'is_year_start': date.is_year_start,
                'is_year_end': date.is_year_end,
            }
            business_features = self._extract_business_features(date)
            historical_features = self._extract_historical_features(date, historical_data)
            rows.append({**base_features, **business_features, **historical_features})

        if not rows:
            return pd.DataFrame(index=pd.DatetimeIndex([], name='date'))
        return pd.DataFrame(rows).set_index('date')

    def _extract_business_features(self, date: datetime) -> Dict:
        """Collect business-side features for ``date``.

        NOTE(review): the ``_get_*`` helpers called here are not defined in
        the visible source; presumably they query CRM / project-management
        systems. They must be provided before this method can run.
        """
        return {
            'expected_sales': self._get_expected_sales(date),
            'project_count': self._get_active_project_count(date),
            'employee_count': self._get_employee_count(date),
            'seasonality_factor': self._get_seasonality_factor(date.month),
            'working_days_in_month': self._get_working_days(date.year, date.month),
        }

    def _empty_historical_features(self) -> Dict:
        """Return every historical feature key, in fixed order, set to NaN."""
        features = {}
        for window in self.WINDOWS:
            for prefix, _ in self._WINDOW_STATS:
                features[f'{prefix}_{window}d'] = np.nan
            features[f'autocorr_7_{window}d'] = np.nan
            features[f'autocorr_30_{window}d'] = np.nan
        features['yoy'] = np.nan
        features['ma_ratio_30_90'] = np.nan
        return features

    def _extract_historical_features(self, date: datetime,
                                     historical_data: pd.DataFrame) -> Dict:
        """Compute rolling-window statistics of ``cash_flow`` before ``date``.

        Only rows strictly before ``date`` are used, so the target day never
        leaks into its own features. Features that cannot be computed (empty
        window, missing prior-year row, zero 90-day mean) stay NaN.

        Args:
            date: Target date the features describe.
            historical_data: Frame indexed by date with a ``cash_flow`` column.

        Returns:
            Dict with the same keys in the same order for every call.
        """
        end_date = date - timedelta(days=1)
        features = self._empty_historical_features()

        for window in self.WINDOWS:
            # Both ends of the mask are inclusive, so step back window - 1
            # days to cover exactly ``window`` calendar days.
            start_date = end_date - timedelta(days=window - 1)
            mask = (historical_data.index >= start_date) & (historical_data.index <= end_date)
            window_data = historical_data.loc[mask, 'cash_flow']
            if window_data.empty:
                continue

            for prefix, method in self._WINDOW_STATS:
                features[f'{prefix}_{window}d'] = getattr(window_data, method)()

            # Autocorrelation is only attempted once the window holds at
            # least as many rows as the lag; pandas returns NaN when the
            # shifted series has too few overlapping points.
            if len(window_data) >= 7:
                features[f'autocorr_7_{window}d'] = window_data.autocorr(lag=7)
                if len(window_data) >= 30:
                    features[f'autocorr_30_{window}d'] = window_data.autocorr(lag=30)

        # Year-over-year value: same calendar day one year earlier, mapping
        # Feb 29 onto Feb 28 because the previous year has no leap day.
        if date.month == 2 and date.day == 29:
            last_year_date = datetime(date.year - 1, 2, 28)
        else:
            last_year_date = datetime(date.year - 1, date.month, date.day)
        if last_year_date in historical_data.index:
            features['yoy'] = historical_data.loc[last_year_date, 'cash_flow']

        # Short-term / long-term moving-average ratio.
        mean_30 = features['mean_30d']
        mean_90 = features['mean_90d']
        if pd.notna(mean_30) and pd.notna(mean_90) and mean_90 != 0:
            features['ma_ratio_30_90'] = mean_30 / mean_90

        return features

XGBoost预测模型

import warnings

import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')


class XGBoostCashFlowPredictor:
    """Cash-flow regressor built on XGBoost with grid-searched hyper-parameters.

    Intended call order: ``prepare_data`` (fits the scaler) -> ``train`` on the
    scaled matrix it returns -> ``predict`` on raw features, which are scaled
    with the already-fitted scaler.
    """

    # Quantiles used for the prediction interval: lower, median, upper.
    QUANTILES = (0.05, 0.5, 0.95)

    def __init__(self):
        self.model = None
        self.feature_importance = None
        self.scaler = StandardScaler()
        # Training matrix kept by train() so quantile models can be fitted later.
        self._train_X = None
        self._train_y = None
        # Cache of fitted quantile regressors, keyed by quantile.
        self._quantile_models = {}

    def prepare_data(self, features: pd.DataFrame, target: pd.Series):
        """Impute, scale and return the training matrix.

        Missing values are replaced by the column median, then the scaler is
        fitted on the result.

        Returns:
            Tuple ``(scaled feature array, target values array)``.
        """
        features_filled = features.fillna(features.median())
        features_scaled = self.scaler.fit_transform(features_filled)
        return features_scaled, target.values

    def train(self, X_train, y_train, cv_folds: int = 5):
        """Grid-search an XGBoost regressor with time-series cross-validation.

        Args:
            X_train: Training features. Pass the scaled array returned by
                ``prepare_data``, because ``predict`` scales its input with
                the same scaler.
            y_train: Training targets.
            cv_folds: Number of ``TimeSeriesSplit`` folds.

        Returns:
            The best parameter combination found by the grid search.
        """
        tscv = TimeSeriesSplit(n_splits=cv_folds)

        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.05, 0.1],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0],
            'min_child_weight': [1, 3, 5]
        }

        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror',
            random_state=42,
            n_jobs=-1
        )

        grid_search = GridSearchCV(
            estimator=xgb_model,
            param_grid=param_grid,
            cv=tscv,
            scoring='neg_mean_absolute_error',
            verbose=1,
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)

        self.model = grid_search.best_estimator_
        self.feature_importance = self.model.feature_importances_

        # Keep the training data: the quantile models behind
        # predict(return_std=True) must be fitted on it. Any quantile models
        # cached from a previous training run are now stale.
        self._train_X = X_train
        self._train_y = y_train
        self._quantile_models = {}

        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV score: {-grid_search.best_score_:.2f}")

        return grid_search.best_params_

    def _get_quantile_models(self):
        """Fit (once) and return one quantile regressor per entry of QUANTILES.

        Raises:
            ValueError: If no training data was recorded by ``train``.
        """
        if self._quantile_models:
            return self._quantile_models
        if getattr(self, '_train_X', None) is None or getattr(self, '_train_y', None) is None:
            raise ValueError("Quantile prediction requires training data; call train() first.")

        # Reuse the tuned hyper-parameters, but drop the keys that are set
        # explicitly below so they are not passed twice.
        base_params = self.model.get_params()
        base_params.pop('objective', None)
        base_params.pop('quantile_alpha', None)

        fitted = {}
        for q in self.QUANTILES:
            quantile_model = xgb.XGBRegressor(
                objective='reg:quantileerror',
                quantile_alpha=q,
                **base_params
            )
            quantile_model.fit(self._train_X, self._train_y)
            fitted[q] = quantile_model
        self._quantile_models = fitted
        return fitted

    def predict(self, X_test, return_std: bool = False):
        """Predict cash flow for raw (unscaled) features.

        Args:
            X_test: Feature matrix in the same column order used for training.
            return_std: When True, return a quantile-regression interval
                instead of the point prediction.

        Returns:
            The point prediction array, or the tuple
            ``(median, lower_bound, upper_bound)`` for the 0.5 / 0.05 / 0.95
            quantiles when ``return_std`` is True.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained yet!")

        X_test_scaled = self.scaler.transform(X_test)

        if not return_std:
            return self.model.predict(X_test_scaled)

        quantile_models = self._get_quantile_models()
        lower_q, median_q, upper_q = self.QUANTILES
        median_pred = quantile_models[median_q].predict(X_test_scaled)
        # The quantile models are fitted independently and can cross; clamp
        # the bounds so that lower <= median <= upper always holds.
        lower_bound = np.minimum(quantile_models[lower_q].predict(X_test_scaled), median_pred)
        upper_bound = np.maximum(quantile_models[upper_q].predict(X_test_scaled), median_pred)
        return median_pred, lower_bound, upper_bound

    def evaluate(self, y_true, y_pred):
        """Compute MAE, RMSE, MAPE (percent) and R2 for a set of predictions.

        MAPE is computed over the non-zero actual values only, since the
        percentage error is undefined when the actual value is 0. It is NaN
        when every actual value is 0.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)

        nonzero = y_true != 0
        if nonzero.any():
            mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
        else:
            mape = np.nan

        metrics = {
            'MAE': mean_absolute_error(y_true, y_pred),
            'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
            'MAPE': mape,
            'R2': r2_score(y_true, y_pred)
        }
        return metrics

Prophet模型集成

python

from prophet import Prophet


class ProphetCashFlowPredictor:
    """Cash-flow forecaster built on Prophet, with holiday and period-end effects."""

    def __init__(self):
        self.model = None
        self.holidays = self._prepare_holidays()

    def _prepare_holidays(self):
        """Build the Prophet ``holidays`` frame.

        Returns:
            Frame with ``holiday``, ``ds``, ``lower_window`` and
            ``upper_window`` columns holding the hard-coded 2024 Chinese
            public holidays plus every month/quarter end from 2023 to 2025.
        """
        # Chinese public holidays (2024 only).
        holidays = pd.DataFrame({
            'holiday': 'chinese_holiday',
            'ds': pd.to_datetime([
                '2024-01-01',  # New Year's Day
                '2024-02-10', '2024-02-11', '2024-02-12',  # Spring Festival
                '2024-04-04', '2024-04-05', '2024-04-06',  # Qingming
                '2024-05-01', '2024-05-02', '2024-05-03',  # Labour Day
                '2024-06-10',  # Dragon Boat Festival
                '2024-09-17',  # Mid-Autumn Festival
                '2024-10-01', '2024-10-02', '2024-10-03',  # National Day
            ]),
            'lower_window': 0,
            'upper_window': 1,  # the effect extends to the day after each holiday
        })

        # Month-end / quarter-end effect (financial settlement days). The
        # dates are taken from a daily range so no frequency alias for period
        # ends is needed. pd.concat cannot join DatetimeIndex objects, so the
        # two sets are combined with a boolean mask instead.
        calendar_days = pd.date_range(start='2023-01-01', end='2025-12-31', freq='D')
        period_ends = calendar_days[calendar_days.is_month_end | calendar_days.is_quarter_end]
        financial_effects = pd.DataFrame({
            'holiday': 'financial_period_end',
            'ds': period_ends,
            'lower_window': -2,  # the effect starts two days before the settlement day
            'upper_window': 0,
        })

        return pd.concat([holidays, financial_effects], ignore_index=True)

    def prepare_data(self, historical_series: pd.Series):
        """Convert a date-indexed series into Prophet's ``ds`` / ``y`` frame.

        Calendar columns are added alongside. Of these, ``train`` registers
        ``is_month_end`` and ``is_quarter_end`` as extra regressors.
        """
        df = historical_series.reset_index()
        df.columns = ['ds', 'y']

        df['year'] = df['ds'].dt.year
        df['month'] = df['ds'].dt.month
        df['day_of_week'] = df['ds'].dt.dayofweek
        df['is_month_end'] = df['ds'].dt.is_month_end.astype(int)
        df['is_quarter_end'] = df['ds'].dt.is_quarter_end.astype(int)

        return df

    def train(self, df: pd.DataFrame):
        """Fit a Prophet model on a frame produced by ``prepare_data``.

        Returns:
            The fitted Prophet model, which is also stored on ``self.model``.
        """
        self.model = Prophet(
            holidays=self.holidays,
            yearly_seasonality=True,
            weekly_seasonality=True,
            daily_seasonality=False,
            seasonality_mode='multiplicative',
            changepoint_prior_scale=0.05,
            holidays_prior_scale=10,
            seasonality_prior_scale=10,
            mcmc_samples=0
        )

        # Extra regressors must be registered before fitting.
        self.model.add_regressor('is_month_end')
        self.model.add_regressor('is_quarter_end')

        self.model.fit(df)
        return self.model

    def predict(self, future_periods: int, freq: str = 'D'):
        """Forecast ``future_periods`` steps beyond the end of the training data.

        Args:
            future_periods: Number of periods to forecast.
            freq: Pandas frequency string for the forecast steps.

        Returns:
            Frame with ``ds``, ``yhat``, ``yhat_lower`` and ``yhat_upper``.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained yet!")

        future = self.model.make_future_dataframe(
            periods=future_periods,
            freq=freq,
            include_history=False
        )

        # The regressors registered in train() must exist on the future frame.
        future['is_month_end'] = future['ds'].dt.is_month_end.astype(int)
        future['is_quarter_end'] = future['ds'].dt.is_quarter_end.astype(int)

        forecast = self.model.predict(future)
        return forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

集成预测框架

python

class EnsembleCashFlowForecaster:
    """Weighted ensemble of the XGBoost and Prophet cash-flow predictors.

    ``train_ensemble`` fits both base models on all but the most recent days
    of history, scores them on the held-out tail and derives non-negative
    blending weights that sum to 1. ``forecast`` then blends their
    predictions with those weights.
    """

    def __init__(self):
        self.models = {
            'xgb': XGBoostCashFlowPredictor(),
            'prophet': ProphetCashFlowPredictor(),
            'ensemble': None
        }
        self.weights = None
        # Column order of the XGBoost training matrix; later feature frames
        # are reindexed to it before being passed on as bare arrays.
        self._feature_columns = None

    def train_ensemble(self, historical_data: pd.DataFrame, validation_days: int = 90):
        """Train the base models and optimise the ensemble weights.

        Args:
            historical_data: Frame indexed by date with a ``cash_flow`` column.
            validation_days: Number of trailing rows held out for validation.

        Returns:
            Dict mapping model name to its validation-set predictions.

        Raises:
            ValueError: If ``validation_days`` is not positive or the history
                is too short to leave any training rows.
        """
        if validation_days <= 0:
            raise ValueError("validation_days must be positive")
        if len(historical_data) <= validation_days:
            raise ValueError(
                f"Need more than {validation_days} rows of history, got {len(historical_data)}"
            )

        feature_engineer = CashFlowFeatureEngineer()

        # Hold out the trailing rows as the validation set.
        train_end = historical_data.index[-validation_days]
        train_data = historical_data[historical_data.index < train_end]
        val_data = historical_data[historical_data.index >= train_end]

        predictions = {}

        # XGBoost
        print("Training XGBoost model...")
        X_train = feature_engineer.create_features(train_data, train_data.index)
        y_train = train_data['cash_flow']
        self._feature_columns = list(X_train.columns)
        # Train on the scaled matrix returned by prepare_data: predict()
        # scales its input with the same scaler, so training on raw features
        # would put the two on different scales.
        X_train_scaled, y_train_values = self.models['xgb'].prepare_data(X_train, y_train)
        self.models['xgb'].train(X_train_scaled, y_train_values)

        # Validation features may use the whole history, since each row only
        # looks at days before its own date. Align columns with training.
        X_val = feature_engineer.create_features(historical_data, val_data.index)
        X_val = X_val.reindex(columns=self._feature_columns)
        y_val = val_data['cash_flow']
        predictions['xgb'] = self.models['xgb'].predict(X_val.values)

        # Prophet
        print("Training Prophet model...")
        prophet_df = self.models['prophet'].prepare_data(train_data['cash_flow'])
        self.models['prophet'].train(prophet_df)
        prophet_forecast = self.models['prophet'].predict(len(val_data), 'D')
        predictions['prophet'] = prophet_forecast['yhat'].values

        # Optimise the blending weights on the validation set.
        self.weights = self._optimize_weights(y_val.values, list(predictions.values()))
        print(f"Optimized weights: {self.weights}")

        return predictions

    def _optimize_weights(self, y_true, predictions_list):
        """Find non-negative weights summing to 1 that minimise ensemble MSE.

        Args:
            y_true: Observed values.
            predictions_list: One prediction array per model, each the same
                length as ``y_true``.

        Returns:
            Array of weights, one per model. Equal weights are returned when
            the optimiser yields an unusable result (non-finite or all zero).
        """
        from scipy.optimize import minimize

        # Work in float64 regardless of the models' output dtype. float32
        # predictions would otherwise lose precision, and integer ones would
        # fail on in-place accumulation.
        stacked = np.vstack([np.asarray(pred, dtype=float) for pred in predictions_list])
        target = np.asarray(y_true, dtype=float)

        n_models = stacked.shape[0]
        equal_weights = np.full(n_models, 1.0 / n_models)

        def objective(weights):
            """Mean squared error of the weighted prediction."""
            return np.mean((target - weights @ stacked) ** 2)

        bounds = [(0, 1) for _ in range(n_models)]
        constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}

        result = minimize(
            objective,
            equal_weights,
            bounds=bounds,
            constraints=constraints,
            method='SLSQP'
        )

        # Remove tiny negative values left by the solver, then renormalise.
        weights = np.clip(result.x, 0.0, None)
        total = weights.sum()
        if not np.all(np.isfinite(weights)) or total <= 0:
            return equal_weights
        return weights / total

    def forecast(self, future_dates: pd.DatetimeIndex, historical_data: pd.DataFrame):
        """Produce the blended forecast with a prediction interval.

        NOTE(review): the Prophet horizon is generated as ``len(future_dates)``
        daily steps after the end of ``historical_data``, so ``future_dates``
        is presumably expected to be exactly those consecutive days. Confirm
        with callers.

        Args:
            future_dates: Dates to forecast.
            historical_data: Frame indexed by date with a ``cash_flow`` column.

        Returns:
            Frame indexed by ``date`` with ``predicted_cash_flow``,
            ``lower_bound``, ``upper_bound`` and ``confidence_interval``
            (the width of the interval).

        Raises:
            ValueError: If ``train_ensemble`` has not been run.
        """
        if self.weights is None:
            raise ValueError("Ensemble not trained yet!")

        feature_engineer = CashFlowFeatureEngineer()
        predictions = []

        # XGBoost prediction with quantile bounds.
        X_future = feature_engineer.create_features(historical_data, future_dates)
        feature_columns = getattr(self, '_feature_columns', None)
        if feature_columns is not None:
            X_future = X_future.reindex(columns=feature_columns)
        pred_xgb, lower_xgb, upper_xgb = self.models['xgb'].predict(X_future.values, return_std=True)
        predictions.append(pred_xgb)

        # Prophet is refitted on the full history before forecasting.
        prophet_df = self.models['prophet'].prepare_data(historical_data['cash_flow'])
        self.models['prophet'].train(prophet_df)
        prophet_forecast = self.models['prophet'].predict(len(future_dates), 'D')
        predictions.append(prophet_forecast['yhat'].values)

        # Weighted average, accumulated in float64.
        stacked = np.vstack([np.asarray(pred, dtype=float) for pred in predictions])
        ensemble_pred = np.asarray(self.weights, dtype=float) @ stacked

        # Prediction interval: the widest envelope of the two models.
        lower_bound = np.minimum(lower_xgb, prophet_forecast['yhat_lower'].values)
        upper_bound = np.maximum(upper_xgb, prophet_forecast['yhat_upper'].values)

        result = pd.DataFrame({
            'date': future_dates,
            'predicted_cash_flow': ensemble_pred,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'confidence_interval': upper_bound - lower_bound
        }).set_index('date')

        return result