zeroMN committed on
Commit
7120259
·
verified ·
1 Parent(s): 8dd969b

Upload 4 files

Browse files
Files changed (4) hide show
  1. test.py +56 -0
  2. test2.py +74 -0
  3. test3.py +87 -0
  4. test4.py +72 -0
test.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Train, persist, reload and evaluate a random-forest regressor.

The script fetches UCI dataset id 320 through ``ucimlrepo``, one-hot encodes
the features, fits a ``RandomForestRegressor`` on the ``G3`` target column,
round-trips the fitted model through ``joblib`` and finally reports the mean
squared error together with a true-vs-predicted scatter plot.
"""
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo

# Fetch the dataset.
student_performance = fetch_ucirepo(id=320)

# Split it into the feature table and the target table.
X = student_performance.data.features
y = student_performance.data.targets

# Peek at the first rows of both tables.
print(X.head())
print(y.head())

# One-hot encode categorical columns; drop_first avoids a redundant
# indicator column per category.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation; only the G3 column is predicted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y['G3'], test_size=0.2, random_state=42
)

# Build and fit the model.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model.
# The checkpoint lives in a directory relative to the current working
# directory (instead of a user-specific absolute path) and the directory is
# created on demand, so the script runs unchanged on any machine.
model_path = Path("model_checkpoints") / "random_forest_model.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)
print(f"模型已保存到 {model_path}")

# Load the model back.
# NOTE: joblib files are pickles; only load checkpoints you produced yourself.
loaded_model = joblib.load(model_path)
print("模型已加载")

# Predict on the held-out rows with the reloaded model.
y_pred = loaded_model.predict(X_test)
print("预测结果:", y_pred)

# Evaluate model performance.
mse = mean_squared_error(y_test, y_pred)
print(f'均方误差: {mse:.2f}')

# Scatter of true vs. predicted values with a y = x reference line.
plt.scatter(y_test, y_pred)
plt.xlabel('真实值')
plt.ylabel('预测值')
plt.title('真实值与预测值对比')
plt.plot([0, 20], [0, 20], color='red', linestyle='--')  # reference line
plt.show()
test2.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Random-forest classification on the CSV export of UCI dataset 17.

Loads the CSV, encodes the ``Diagnosis`` column as 0/1, trains a
``RandomForestClassifier``, prints a classification report and plots the
feature importances.  A trailing section fetches the same dataset id through
``ucimlrepo`` and prints its metadata and variable table.
"""
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Dataset URL and load.
data_url = 'https://archive.ics.uci.edu/static/public/17/data.csv'
df = pd.read_csv(data_url)

# Show the first rows of the dataset.
print("数据集的前几行:")
print(df.head())

# Preprocessing: encode the target (M -> 1, B -> 0).
diagnosis_codes = {'M': 1, 'B': 0}
df['Diagnosis'] = df['Diagnosis'].map(diagnosis_codes)

# Features are every column except the identifier and the target.
X = df.drop(columns=['ID', 'Diagnosis'])
y = df['Diagnosis']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train, then predict on the held-out rows.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Print the classification report.
report = classification_report(y_test, y_pred)
print("\n分类报告:")
print(report)

# Feature-importance data for the chart.
feature_importances = model.feature_importances_
features = X.columns
indices = range(len(features))  # kept from the original; the plot does not use it

# Horizontal bar chart: one bar per feature.
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances, y=features)
plt.title('特征重要性')
plt.xlabel('重要性')
plt.ylabel('特征')
plt.show()

####################################################################
# Same dataset id fetched through the ucimlrepo client.
from ucimlrepo import fetch_ucirepo

breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Data (as pandas dataframes); this rebinds X and y.
X = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets

# Metadata.
print(breast_cancer_wisconsin_diagnostic.metadata)

# Variable information.
print(breast_cancer_wisconsin_diagnostic.variables)


##################################################################
# Figures noted by the author; presumably the per-class rows and the
# accuracy row of a classification report from an earlier run.
# 0 0.96 0.99 0.97 71
# 1 0.98 0.93 0.95 43

# accuracy 0.96 114
test3.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Random-forest classification on the CSV export of UCI dataset 15.

Loads the CSV, coerces ``Bare_nuclei`` to numeric (unparseable entries such
as ``'?'`` become NaN), drops incomplete rows, encodes ``Class`` (2 -> 0,
4 -> 1), trains a ``RandomForestClassifier`` and reports a classification
report, a confusion matrix and the feature importances.  A trailing section
fetches the same dataset id through ``ucimlrepo`` and prints its metadata
and variable table.
"""
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo

# Dataset URL.
data_url = 'https://archive.ics.uci.edu/static/public/15/data.csv'

# Load the dataset.
df = pd.read_csv(data_url)

# Show the first rows of the dataset.
print("数据集的前几行:")
print(df.head())

# Preprocessing.
# Handle missing values.  `Series.replace('?', None)` is version-dependent in
# pandas (older releases treat a None value as "pad", i.e. forward-fill the
# previous row), so coerce explicitly: anything non-numeric becomes NaN.
df['Bare_nuclei'] = pd.to_numeric(df['Bare_nuclei'], errors='coerce')
df = df.dropna()  # drop rows containing any missing value

# Encode the target (2 -> 0, 4 -> 1).
df['Class'] = df['Class'].map({2: 0, 4: 1})

# Features and target.
X = df.drop(columns=['Sample_code_number', 'Class'])
y = df['Class']

# Train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict.
y_pred = model.predict(X_test)

# Print the classification report.
print("\n分类报告:")
print(classification_report(y_test, y_pred))

# Confusion-matrix heat map.  `labels` pins the matrix to 2x2 in class order
# 0, 1 so it always lines up with the two tick labels below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Benign', 'Malignant'],
    yticklabels=['Benign', 'Malignant'],
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

# Feature importances, sorted so the most influential feature is on top.
importances = pd.Series(
    model.feature_importances_, index=X.columns
).sort_values(ascending=False)

# Bar chart.
plt.figure(figsize=(12, 6))
sns.barplot(x=importances.values, y=importances.index)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


###############################################
# Same dataset id fetched through the ucimlrepo client.
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

# Data (as pandas dataframes); this rebinds X and y.
X = breast_cancer_wisconsin_original.data.features
y = breast_cancer_wisconsin_original.data.targets

# Metadata.
print(breast_cancer_wisconsin_original.metadata)

# Variable information.
print(breast_cancer_wisconsin_original.variables)

##########################################################
# Figures noted by the author from an earlier run; presumably the per-class
# rows and the accuracy row of the classification report.  They may differ
# slightly after the missing-value handling change above.
# 0 0.93 0.99 0.96 79
# 1 0.98 0.90 0.94 58

# accuracy 0.95 137
test4.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Predict gender from a first name using the CSV export of UCI dataset 591.

Names are vectorised with character n-gram TF-IDF, combined with the numeric
``Count`` and ``Probability`` columns, and fed to a
``RandomForestClassifier``.  The script prints a classification report and
shows a confusion matrix.  A trailing section fetches the same dataset id
through ``ucimlrepo`` and prints its metadata and variable table.
"""
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo

# Dataset URL.
data_url = 'https://archive.ics.uci.edu/static/public/591/data.csv'

# Load the dataset.
df = pd.read_csv(data_url)

# Show the first rows of the dataset.
print("数据集的前几行:")
print(df.head())

# Preprocessing: encode Gender (M -> 1, F -> 0).
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})

# Split the rows first so the vectoriser below is fitted on training data
# only; fitting it on every row would leak test-set vocabulary and IDF
# statistics into training.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
y_train = train_df['Gender']
y_test = test_df['Gender']

# Vectorise the Name column.  A word-level vectoriser would emit one column
# per distinct name, which carries no signal for names unseen in training;
# character n-grams capture prefixes and suffixes that do generalise.
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 3))
X_name_train = vectorizer.fit_transform(train_df['Name'])
X_name_test = vectorizer.transform(test_df['Name'])

# Append Count and Probability to the name features.  The numeric block is
# wrapped as a sparse matrix and the result is requested as CSR so it is
# row-indexable downstream.
numeric_cols = ['Count', 'Probability']
X_train = sparse.hstack(
    [X_name_train, sparse.csr_matrix(train_df[numeric_cols].to_numpy(dtype=float))],
    format='csr',
)
X_test = sparse.hstack(
    [X_name_test, sparse.csr_matrix(test_df[numeric_cols].to_numpy(dtype=float))],
    format='csr',
)

# Train the model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict.
y_pred = model.predict(X_test)

# Print the classification report.
print("\n分类报告:")
print(classification_report(y_test, y_pred))

# Confusion-matrix heat map.  `labels` pins the matrix to 2x2 in class order
# 0 (Female), 1 (Male) so it always lines up with the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Female', 'Male'],
    yticklabels=['Female', 'Male'],
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

#############################################
# Same dataset id fetched through the ucimlrepo client.
gender_by_name = fetch_ucirepo(id=591)

# Data (as pandas dataframes).
X = gender_by_name.data.features
y = gender_by_name.data.targets

# Metadata.
print(gender_by_name.metadata)

# Variable information.
print(gender_by_name.variables)