diff --git a/analysis.py b/analysis.py
index 69fc53d..7001b70 100644
--- a/analysis.py
+++ b/analysis.py
@@ -1,34 +1,39 @@
 import pandas as pd
-from sklearn.model_selection import train_test_split, KFold, cross_val_score
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, recall_score, precision_score
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.decomposition import PCA
 import matplotlib.pyplot as plt

 pca_ccr = 0.98  # cumulative contribution ratio threshold for PCA
-df = pd.DataFrame(columns=['input', 'target', 'pca_dim', 'accuracy', 'recall', 'precision', 'F1', 'AUC'])
+n_folds = 5  # number of cross-validation folds
+df = pd.DataFrame(
+    columns=['DataType', 'Input', 'Target', 'Model',
+             'PCAdim', 'Accuracy', 'Recall', 'Precision', 'F1', 'AUC']
+    )
 df_index = 0

 def make_dataset():
     """ Build the dataset """
-    print("\nPreparing dataset...")
-    all_data = pd.read_csv('SmTIAS_PhoneApp.csv')
-    input = df.loc[df_index, 'input']  # get the input type
+    # print("\nPreparing dataset...")
+    all_data = pd.read_csv(f"Merged_{df.loc[df_index, 'DataType']}App.csv")
+    input = df.loc[df_index, 'Input']  # get the input type
     if input == "shape":
-        print("Using shape features...")
+        # print("Using shape features...")
         x = all_data.loc[:, 'shape-width':'shape-bottomRightY']
     elif input == "color":
-        print("Using color features...")
+        # print("Using color features...")
         x = all_data.loc[:, 'chiu-lateral-L-min':'fiveClick-tip-b-kurtosis']
     elif input == "texture":
-        print("Using texture features...")
+        # print("Using texture features...")
         x = all_data.loc[:, 'chiu-lateral-contrast':'fiveClick-tip-correlation']
     else:
-        print("Using all features...")
+        # print("Using all features...")
         x = all_data.loc[:, 'shape-width':'fiveClick-tip-b-kurtosis']  # all features
     scores = all_data.loc[:, 'A01':'C08']  # scores
     invert_list = ['A01', 'A02', 'A03']
@@ -38,9 +43,9 @@
     scores['B'] = scores.loc[:, 'B07':'B29'].sum(axis=1)  # group B
     scores['C'] = scores.loc[:, 'C01':'C08'].sum(axis=1)  # group C
     scores['Total'] = scores.loc[:, 'A':'C'].sum(axis=1)  # total score
-    target = df.loc[df_index, 'target']  # target score
+    target = df.loc[df_index, 'Target']  # target score
     threshold = scores[target].median()  # use the median as the threshold
-    print(f'Threshold for {target}: {threshold}')
+    # print(f'Threshold for {target}: {threshold}')
     # scores['Total'].plot.hist(bins=20, edgecolor='black')  # draw a histogram
     # import matplotlib.pyplot as plt
     # plt.title('Total Score Distribution')
@@ -52,28 +57,29 @@
     # print(scores.head(3))
     return x, scores['label']

-def cross_val(x, y):
+def cross_val(model, x, y):
     """ Run cross-validation """
-    print("\nStarting cross-validation...")
+    # print("\nStart cross-validation")
     pca_dim = calc_pca_dim(x)
-    kfold_cv = KFold(n_splits=5, shuffle=True)
+    df.loc[df_index, 'PCAdim'] = pca_dim  # record the number of PCA components
+    kfold_cv = KFold(n_splits=n_folds, shuffle=True)
     pipe = Pipeline([("scaler", StandardScaler()),
                      ("pca", PCA(n_components=pca_dim)),
-                     ("model", RandomForestClassifier())])
-    accuracy = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="accuracy").mean()
-    recall = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="recall").mean()
-    precision = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="precision").mean()
-    F1 = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="f1").mean()
-    auc = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="roc_auc").mean()
-    print(f"Accuracy: {accuracy:.3f}, recall: {recall:.3f}, precision: {precision:.3f}, F1: {F1:.3f}, AUC: {auc:.3f}")
+                     ("model", model)])
+    df.loc[df_index, 'Accuracy'] = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="accuracy").mean()
+    df.loc[df_index, 'Recall'] = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="recall").mean()
+    df.loc[df_index, 'Precision'] = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="precision").mean()
+    df.loc[df_index, 'F1'] = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="f1").mean()
+    df.loc[df_index, 'AUC'] = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="roc_auc").mean()
+    # print(f"Accuracy: {accuracy:.3f}, recall: {recall:.3f}, precision: {precision:.3f}, F1: {F1:.3f}, AUC: {auc:.3f}")

 def calc_pca_dim(x):
     """ Compute the number of PCA components """
-    print("\nCalculating PCA dimensions...")
+    # print("\nCalculating PCA dimensions...")
     scaler = StandardScaler()
     x_scaled = scaler.fit_transform(x)
     pca = PCA(n_components=None)  # keep as many components as the input data has dimensions
@@ -84,7 +90,7 @@
         ccr += pca.explained_variance_ratio_[i]
         if ccr >= pca_ccr:
             pca_dim = i + 1  # component indices start at 0, so add 1
-            print(f'Number of components to reach 95% variance: {pca_dim}')
+            # print(f'Number of components to reach {pca_ccr:.0%} variance: {pca_dim}')
             break
     return pca_dim

@@ -120,9 +126,28 @@
 # main
 if __name__ == "__main__":
-    df.loc[df_index,:] = ['all', 'Total', 0, 0, 0, 0, 0, 0]  # initialize
-    x, y = make_dataset()  # specify the input and target
-    train_predict(x, y)  # train the model and predict
-    # cross_val(x, y, df[idx])  # run cross-validation
+
+    print("Starting analysis...")
+    data_type_list = ["SmTIAS_Phone", "SmTIAS_Web", "HandyTCC_Phone", "HandyTCC_Web"]  # dataset types
+    input_list = ["shape", "color", "texture", "all"]  # input types
+    target_list = ["A", "B", "C", "Total"]  # target scores
+    model_list = [RandomForestClassifier(),
+                  AdaBoostClassifier(),
+                  SVC()]  # model types
+    counts = len(data_type_list) * len(input_list) * len(target_list) * len(model_list)
+
+    for data_type in data_type_list:
+        for input in input_list:  # specify the input type
+            for target in target_list:
+                for model in model_list:
+                    model_name = model.__class__.__name__.replace("Classifier", "")  # drop "Classifier" from the model name
+                    print(f"\nAnalyzing {df_index+1}/{counts} {data_type}, {input}, {target}, {model_name}")
+                    df.loc[df_index,:] = [data_type, input, target, model_name, 0, 0, 0, 0, 0, 0]  # initialize the result row
+                    x, y = make_dataset()  # specify the input and target
+                    # train_predict(x, y)  # train the model and predict
+                    cross_val(model, x, y)  # run cross-validation
+                    df_index += 1
+    print("\nAnalysis complete.")
     print(df)  # print the results
+    df.to_csv('analysis_results.csv', index=False)  # save the results to a CSV file
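
---

Note on cross_val(): the five cross_val_score calls share one KFold(shuffle=True)
with no fixed random_state, so each metric is averaged over a different set of
splits, and the pipeline is refit five times per configuration. A minimal sketch
of an alternative that scores all metrics on identical folds with a single pass,
assuming the same pipe, x, y, and n_folds as in the diff (cross_val_same_folds
is a hypothetical helper, not part of this patch):

    from sklearn.model_selection import KFold, cross_validate

    def cross_val_same_folds(pipe, x, y, n_folds=5):
        # One shuffled split plan, reused for every metric.
        kfold_cv = KFold(n_splits=n_folds, shuffle=True, random_state=0)
        scoring = ["accuracy", "recall", "precision", "f1", "roc_auc"]
        # cross_validate fits once per fold and evaluates all scorers on
        # that fit, returning arrays keyed as "test_<scorer>".
        results = cross_validate(pipe, x, y, cv=kfold_cv, scoring=scoring)
        return {name: results[f"test_{name}"].mean() for name in scoring}

Besides making the five reported metrics comparable, this cuts the number of
fits per configuration from 5 * n_folds to n_folds.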
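Note on calc_pca_dim(): it fits PCA on the full dataset to choose pca_dim before
cross-validation, so the dimensionality choice sees the test folds. sklearn's PCA
already supports a variance target directly: with 0 < n_components < 1 it keeps
just enough components to explain that fraction of variance, re-chosen on each
training fold when PCA sits inside the pipeline. A sketch, reusing pca_ccr = 0.98
from the diff:

    from sklearn.decomposition import PCA
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    pca_ccr = 0.98  # cumulative explained-variance target, as in the diff
    pipe = Pipeline([("scaler", StandardScaler()),
                     # a float n_components requires the full SVD solver
                     ("pca", PCA(n_components=pca_ccr, svd_solver="full")),
                     ("model", RandomForestClassifier())])

The retained dimensionality would then vary per fold, so the PCAdim column would
have to be dropped or read back from the fitted step after a final refit.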