from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import pandas as pd
def feature_selection(X_scaled, y):  # model-based feature selection
    '''
    Note:
    This is the first function of the models module.
    Purpose:
    Model-based feature selection: fit a random forest, print each feature's
    importance, then keep only the features whose importance is at or above
    the median and return them.
    :param X_scaled: preprocessed feature DataFrame (all features)
    :param y: labels for all samples
    :return: DataFrame containing only the selected feature columns
    '''
    # 100 trees with a fixed seed so the selection is reproducible.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    clf = forest.fit(X_scaled, y)
    print('特征得分:')
    print(clf.feature_importances_)
    # Reuse the already-fitted forest (prefit=True) instead of letting
    # SelectFromModel refit all 100 trees a second time via fit_transform;
    # the result is identical because the seed and data are the same.
    selector = SelectFromModel(clf, threshold='median', prefit=True)
    X_selected = selector.transform(X_scaled)
    # Restore column labels lost by the numpy transform output.
    X_selected = pd.DataFrame(X_selected, columns=X_scaled.columns[selector.get_support()])
    print('选择的特征')
    print(X_selected.columns)
    return X_selected


# Hyperparameter tuning via k-fold cross-validated grid search
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

def train_models(X_selected, y):
    '''
    Note: this is the second function of the models module.
    Purpose: run a cross-validated grid search over random-forest
    hyperparameters, print the best combination found, and return the
    estimator refitted with those parameters on the full training set.
    :param X_selected: training-set features
    :param y: training-set labels
    :return: the fitted best model
    '''
    # 5-fold CV with shuffling and a fixed seed for reproducible splits.
    cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    # Candidate hyperparameter values to sweep exhaustively.
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    searcher = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        cv=cv_splitter,
        scoring='accuracy',
    )
    searcher.fit(X_selected, y)

    print("最佳参数:", searcher.best_params_)
    return searcher.best_estimator_