-
-
[原创]FLAML 自动建模测试
-
发表于: 1天前 307
-
"""
FLAML 自动建模初始
"""
import time
start_time = time.time()
from flaml import AutoML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import sys
import os
sys.path.append(r'D:/python')
from utils import datatls as dtls
from utils import mdtools as mdtl
# 导入 optbinning
from optbinning import BinningProcess, OptimalBinning
from optbinning import BinningProcess
import joblib
#%% Load data
# NOTE(review): `cap` is expected to be defined by an earlier cell/run — it is
# not visible in this file; confirm before running top-to-bottom.
#%% Select the customer segment to model
# kw_model = '高反专项第9次建模'
# sqldf = cap[cap['返利'] >= 0.1]
kw_model = '二手车lgbm模型V7 数据'
sqldf = cap
# Create a dedicated project folder for this modeling run
basepa = r'\子模型项目\%s' % kw_model
os.makedirs(basepa, exist_ok=True)
def xieru(info):
    """Append one line of run information to the model-info log file.

    The log lives under the module-level project folder ``basepa`` and is
    opened in append mode on each call, so every invocation adds one line.

    Parameters
    ----------
    info : str
        Text to record; a trailing newline is appended automatically.
    """
    # os.path.join is safer than manual '\\' concatenation for Windows paths.
    log_path = os.path.join(basepa, '模型信息.txt')
    with open(log_path, 'a', encoding='utf-8') as f:
        f.write(info + '\n')
# NOTE(review): `start` / `end` come from an earlier cell — not visible here.
datas = dtls.get_abcddf(start, end, sqldf)
xieru(f'=========={kw_model}===============')
datas = datas.drop_duplicates(subset=['query_id'])
# Lease-start month as a 'YYYYMM' string, used below for the time-based split
datas['起租月份'] = datas['起租日_乘'].dt.strftime('%Y%m')
cc=datas.columns
#%% Feature selection and time-based split
# xieru('不限制时间训练10000次')
# Training set: 2024-01 .. 2024-09
traindf = datas[datas['起租月份'].between('202401','202409')]
# Test set: 2024-10 .. 2024-12
testdf = datas[datas['起租月份'].between('202410','202412')]
# Out-of-time validation set: 2025-01 .. 2025-02
ootdf = datas[datas['起租月份'].between('202501','202502')]
# Second out-of-time window (2025-03 .. 2025-04), evaluated later
cctdf = datas[datas['起租月份'].between('202503','202504')]
# Combined test + OOT window, used as FLAML's holdout validation set below
testood=datas[datas['起租月份'].between('202410','202502')]
cctdf.columns
data = traindf.copy()
xieru(f'训练起租时间:{traindf["起租月份"].unique()}')
xieru(f'测试集起租时间:{testdf["起租月份"].unique()}')
xieru(f'时间外验证起租时间:{ootdf["起租月份"].unique()}')
print("✅ 正在进行数据预处理...")
# Columns to exclude from the feature matrix: the documented meta columns
# (from `colsdf`, defined outside this view) plus the target, bookkeeping
# keys and the rebate field.
dropcol = colsdf['字段含义'].tolist()
dropcol.extend(['9pd30', 'data_time', 'query_id', '返利'])
X = data.drop(columns=dropcol)
y = data['9pd30']
# Missing-value imputation with the per-column mode; mode() may return
# several rows when there are ties, so iloc[0] takes the first mode.
X = X.fillna(X.mode().iloc[0])
# Keep only numeric columns with non-zero variance as binning candidates
# (describe() covers numeric columns only).
cc = X.describe().T
cc2 = cc[cc['std'] != 0]
selected_features = cc2.index.tolist()
# Label-encode the remaining object (categorical) columns.  A fresh encoder
# per column avoids carrying one encoder's fitted state across columns.
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col])
# Snapshot of the selected numeric features (unaffected by the label
# encoding above) for the downstream binning / filtering step.
X_raw = X[selected_features].copy()
# Feature filtering
print("✅ 筛选变量...")
# Screens candidate features against the IV / Spearman / VIF / binning
# thresholds passed below, logging decisions through `xieru`.
# NOTE(review): exact semantics live in utils.mdtools.filter_features_by_criteria
# — not visible here.
selected_features, all_results,corr_matrix,corr_matrix_final = mdtl.filter_features_by_criteria(
df=pd.concat([X_raw,y],axis=1),
target_col='9pd30',
feature_cols=cc2.index.tolist(),
iv_threshold=0.02,
spearman_threshold=0.02,
spearman_threshold_fea=0.7,
vif_threshold=10,
max_bins=8,
min_bin_size=0.03,
xieru=xieru)
# Persist the selection report and the final inter-feature correlations
selected_features.to_csv(basepa+'\\已选特征信息.csv')
corr_matrix_final.to_csv(basepa+'\\特征之间相关系数最终变量.csv')
se_fealist=selected_features['feature'].tolist()
#%% First modeling pass: FLAML
print(" 正在进行第一次建模(FLAML)")
def custom_ks_metric(
    X_val, y_val,
    estimator,
    labels,
    X_train, y_train,
    weight_val=None, weight_train=None,
    config=None, groups_val=None, groups_train=None
):
    """FLAML custom metric: Kolmogorov-Smirnov statistic on the holdout set.

    FLAML minimizes the first return value, so ``1 - KS`` is returned as the
    loss; the raw KS is also reported in the metrics dict for logging.

    Returns
    -------
    tuple[float, dict]
        ``(1 - ks, {"ks": ks})`` on success; ``(1.0, {"ks": 0.0})`` — the
        worst possible loss — when probabilities cannot be produced.
    """
    try:
        # Probability of the positive class (column 1 of predict_proba).
        y_pred = estimator.predict_proba(X_val)[:, 1]
    except Exception:
        # e.g. an estimator without predict_proba or one that failed to fit:
        # penalize instead of crashing the search.  (Was a bare `except:`,
        # which also swallowed KeyboardInterrupt/SystemExit.)
        return 1.0, {"ks": 0.0}
    fpr, tpr, _ = roc_curve(y_val, y_pred)
    # KS = max vertical gap between the TPR and FPR curves.
    ks = np.max(tpr - fpr)
    return 1 - ks, {"ks": ks}
# FLAML 设置:仅训练 LGBM/XGB,时间预算 300s
automl = AutoML()
automl_settings = {
"task": "classification",
"metric": custom_ks_metric,
# 'time_budget':-1, # 无时间限制
"time_budget": 60*30,
# 'max_iter':10000, # 最多 10000 次 trial
# "estimator_list": ["lgbm", "xgboost"],# 聚焦一个模型,避免分散 trial
"estimator_list": ["lgbm"],# 聚焦一个模型,避免分散 trial
"eval_method": "holdout",
"seed": 42,
"verbose": 1,
"early_stop": True,
"retrain_full": True,
"log_file_name": basepa + "\\flaml_model.log",
"n_jobs": -1,
}
# 训练 FLAML
automl.fit(X_train=traindf[se_fealist], y_train=traindf['9pd30'], X_val=testood[se_fealist], y_val=testood['9pd30'], **automl_settings)
print(f"✅ 第一次建模完成")
print(f"最优模型: {automl.best_estimator}")
xieru(f"最优模型: {automl.best_estimator}")
print(f"最优参数: {automl.best_config}")
xieru(f"最优参数: {automl.best_config}")
#%% Extract feature importances from the best model
feat_names = traindf[se_fealist].columns
imp_values = automl.model.feature_importances_
importance_df = pd.DataFrame({"feature": feat_names, "importance": imp_values})
importance_df = importance_df.sort_values("importance", ascending=False)
# Keep only features the model actually used
importance_df = importance_df[importance_df["importance"] > 0]
# Persist the importance table
importance_df.to_csv(basepa + "\\feature_importance.csv", index=False)
#%% Persist the fitted AutoML object for later reuse
joblib.dump(automl, basepa + "\\flaml_model.pkl")
print("✅ 建模完成:模型已保存")
#%% Evaluation and visualization
print(" 正在评估模型性能...")
# Datasets to evaluate: train / test / OOT / second OOT window (2025-03/04)
diccan = {
    '训练': {'X': traindf[se_fealist], 'y': traindf['9pd30']},
    '测试': {'X': testdf[se_fealist], 'y': testdf['9pd30']},
    '时间外验证': {'X': ootdf[se_fealist], 'y': ootdf['9pd30']},
    '时间外2验证(25年3月4月)': {'X': cctdf[se_fealist], 'y': cctdf['9pd30']}
}
scoredic = {}
ydic = {}
for k1, v1 in diccan.items():
    xieru(f'{"="*10}{k1}{"="*10}')
    # One sub-folder of outputs per dataset
    basepa2 = basepa + f"\\{k1}"
    os.makedirs(basepa2, exist_ok=True)
    X = v1['X']
    y = v1['y']
    # Predicted probability of the positive class
    y_pred_proba = automl.predict_proba(X)[:, 1]
    y_pred = automl.predict(X)
    # Scorecard output (base bad rate used by the score transform)
    badrate = 1/80
    # Monotonicity tables: equal-frequency and equal-width score binning
    resscore, score = mdtl.ScoreBin(y_pred_proba, y, badrate=badrate)
    resscore1, _ = mdtl.ScoreBin(y_pred_proba, y, badrate=badrate, qcut=1, n=7)
    resscore.to_csv(basepa2 + f"\\{k1}_评分等频分箱单调性.csv", index=False)
    resscore1.to_csv(basepa2 + f"\\{k1}_评分等距分箱单调性.csv", index=False)
    # Cache scores/targets for the PSI stability check after the loop
    scoredic[k1] = score
    ydic[k1] = y.reset_index()['9pd30']
    X.to_csv(basepa2 + f"\\特征.csv", index=False)
    score.to_csv(basepa2 + f"\\target_score.csv", index=False)
    # Metrics — renamed local so it no longer shadows sklearn.metrics.auc
    auc_val = roc_auc_score(y, y_pred_proba)
    resc = mdtl.KS(score['score'], y, out=1)
    ks = max(resc['KS'])
    xieru(f' {k1} AUC: {auc_val}')
    xieru(f' {k1} ks: {ks}')
    # Plots: KS curve, ROC, score distribution, confusion matrix
    mdtl.PlotKS(resc, pa=basepa2 + "\\score_ks.png")
    mdtl.PlotROC(score['score'], y, pa=basepa2+'\\score_ROC.png', xieru=xieru)
    mdtl.PlotDistribution(score['score'], y, score=1, pa=basepa2+'\\Distribution.png', xieru=xieru)
    mdtl.PlotCM(score['score'], y, score=1, pa=basepa2+'\\score_CM.png', xieru=xieru)
print("✅ 建模全流程完成,模型与评估结果已保存。")
# Score PSI: train vs test, and train vs out-of-time validation.
# (The original computed psi2 twice with identical arguments; the redundant
# second call was removed.)
psi1 = mdtl.PSI2(scoredic['训练']['score'], scoredic['测试']['score'], Y_A=ydic['训练'], Y_E=ydic['测试'])
psi2 = mdtl.PSI2(scoredic['训练']['score'], scoredic['时间外验证']['score'], Y_A=ydic['训练'], Y_E=ydic['时间外验证'])
xieru('==========稳定性============================')
xieru(f'测试稳定性 psi: {psi1["PSI"].sum()}')
xieru(f'时间外验证 psi: {psi2["PSI"].sum()}')
psi1.to_csv(basepa + f"\\测试psi稳定性.csv", index=False)
psi2.to_csv(basepa + f"\\时间外验证psi稳定性.csv", index=False)
# Total wall-clock time of the whole script
end_time = time.time()
print(f"耗时: {round(end_time - start_time, 2)} 秒")
xieru(f"耗时: {round(end_time - start_time, 2)} 秒")
#%% Ad-hoc benchmark: KS and decile table of an existing external score
# on the 2025-03/04 window (cctdf)
cctdf.columns
# .copy() so the renames/assignments below operate on an independent frame
# rather than a fancy-indexed slice (avoids SettingWithCopy issues).
score = cctdf[['蜜蜂分_高利率版_6_2', '9pd30']].copy()
score.columns = ['score', 'target']
resc = mdtl.KS(score['score'], score['target'], out=1)
ks = max(resc['KS'])
print(ks)
score['score'] = np.round(score['score'], 4)
scoretmp = score.copy()
# Decile (equal-frequency) binning of the score; duplicate edges dropped
score['qcut'] = pd.qcut(score['score'], 10, duplicates='drop')
# Cast the Categorical bins to plain objects (was a pd.DataFrame round-trip)
score['qcut'] = score['qcut'].astype('object')
# Per-bin bad count and bin size
score = score.pivot_table(index='qcut', values='target', aggfunc=[np.sum, np.size])
score['rate'] = score['sum'] / score['size']
score.columns = score.columns.droplevel(1)
score = score.reset_index()
# Reporting column names
score = score.rename(columns={'sum': '#Bad', 'size': '#All', 'rate': '%BadRate'})
score['%All'] = score['#All']/sum(score['#All'])
score['#CumAll'] = score['#All'].cumsum()
score['%CumAll'] = score['#CumAll'] / max(score['#CumAll'])
赞赏
他的文章
- [原创]FLAML 自动建模测试 308
- [原创]mlops测试 311
- [原创]建模常用库 315
- [原创]机器学习基础-特征工程2分箱 1142
- [原创]机器学习基础-特征工程1 714
赞赏
雪币:
留言: