from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import pandas as pd
def feature_selection(X_scaled, y):
    """Model-based feature selection via random-forest importances.

    Fits a RandomForestClassifier once (inside SelectFromModel), prints the
    per-feature importance scores, and keeps only the features whose
    importance is at or above the median importance.

    :param X_scaled: pd.DataFrame of preprocessed features; column names are
        used to label the selected output.
    :param y: labels aligned with the rows of X_scaled.
    :return: pd.DataFrame containing only the selected feature columns.
    """
    # Fix: the original code fitted the forest explicitly and then let
    # SelectFromModel (prefit=False) clone and refit it in fit_transform,
    # training the same model twice. With the fixed random_state the single
    # fit below yields identical importances and selection at half the cost.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    selector = SelectFromModel(forest, threshold='median')
    X_selected = selector.fit_transform(X_scaled, y)
    print('特征得分:')
    print(selector.estimator_.feature_importances_)
    # Rebuild a DataFrame so the surviving column names are preserved.
    X_selected = pd.DataFrame(X_selected,
                              columns=X_scaled.columns[selector.get_support()])
    print('选择的特征')
    print(X_selected.columns)
    return X_selected
# K-fold cross-validated grid search for hyper-parameter tuning
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
def train_models(X_selected, y):
    """Tune a RandomForestClassifier with 5-fold grid search.

    Runs an exhaustive grid search over common random-forest
    hyper-parameters, prints the best parameter combination, and returns
    the estimator refit on the full training data with those parameters.

    :param X_selected: training-set features.
    :param y: training-set labels.
    :return: the best fitted RandomForestClassifier (GridSearchCV refits it
        on all of X_selected/y by default).
    """
    # 5-fold CV with a fixed seed for reproducible fold assignment.
    # NOTE(review): plain KFold does not stratify; for strongly imbalanced
    # labels StratifiedKFold would be safer, but switching would change the
    # existing (seeded) folds — confirm before altering.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # 3 * 4 * 3 * 3 = 108 parameter combinations, each scored over 5 folds.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    # n_jobs=-1 parallelises the search across all cores; the result is
    # deterministic (fixed random_state), so only wall-clock time changes.
    grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                               param_grid=param_grid, cv=kf,
                               scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_selected, y)
    best_params = grid_search.best_params_
    print("最佳参数:", best_params)
    return grid_search.best_estimator_