diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4cd82a9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,17 @@
+ストレス調査データ解析
+-----
+
+## analysis.py
+
+機械学習によるストレス有無のクラス分類
+
+
+## make_data.py
+データ合成プログラム
+
+survey_data_add.csv と DeepTIAS_Phone/Web.csv
+をマージして1つのcsvを生成
+
+## 参考情報
+
+[厚生労働省ストレスチェック関連情報](https://stresscheck.mhlw.go.jp/material.html)
diff --git a/analysis.py b/analysis.py
new file mode 100644
index 0000000..69fc53d
--- /dev/null
+++ b/analysis.py
@@ -0,0 +1,128 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split, KFold, cross_val_score
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+
+pca_ccr = 0.98  # cumulative contribution (explained variance) ratio threshold for PCA
+df = pd.DataFrame(columns=['input', 'target', 'pca_dim', 'accuracy', 'recall', 'precision', 'F1', 'AUC'])
+df_index = 0
+
+def make_dataset():
+    """
+    Build the feature matrix x and a binary label from the survey scores.
+    """
+    print("\nPreparing dataset...")
+    all_data = pd.read_csv('SmTIAS_PhoneApp.csv')
+    feature_set = df.loc[df_index, 'input']  # which feature group to use (renamed: do not shadow builtin input)
+    if feature_set == "shape":
+        print("Using shape features...")
+        x = all_data.loc[:, 'shape-width':'shape-bottomRightY']
+    elif feature_set == "color":
+        print("Using color features...")
+        x = all_data.loc[:, 'chiu-lateral-L-min':'fiveClick-tip-b-kurtosis']
+    elif feature_set == "texture":
+        print("Using texture features...")
+        x = all_data.loc[:, 'chiu-lateral-contrast':'fiveClick-tip-correlation']
+    else:
+        print("Using all features...")
+        x = all_data.loc[:, 'shape-width':'fiveClick-tip-b-kurtosis']  # all feature columns
+    scores = all_data.loc[:, 'A01':'C08']  # questionnaire score columns
+    invert_list = ['A01', 'A02', 'A03']
+    for invert in invert_list:
+        scores[invert] = 5 - scores[invert]  # reverse-scored items
+    scores['A'] = scores.loc[:, 'A01':'A10'].sum(axis=1)  # group A total
+    scores['B'] = scores.loc[:, 'B07':'B29'].sum(axis=1)  # group B total (NOTE(review): starts at B07 -- confirm B01-B06 are intentionally excluded)
+    scores['C'] = scores.loc[:, 'C01':'C08'].sum(axis=1)  # group C total
+    scores['Total'] = scores.loc[:, 'A':'C'].sum(axis=1)  # overall total score
+    target = df.loc[df_index, 'target']  # target score column
+    threshold = scores[target].median()  # use the median as the class threshold
+    print(f'Threshold for {target}: {threshold}')
+    # scores['Total'].plot.hist(bins=20, edgecolor='black')  # draw histogram
+    # import matplotlib.pyplot as plt
+    # plt.title('Total Score Distribution')
+    # plt.xlabel('Total Score')
+    # plt.ylabel('Frequency')
+    # plt.show()  # show histogram
+    scores['label'] = 0
+    scores.loc[scores[target] >= threshold, 'label'] = 1  # label 1 when the target score >= threshold
+    # print(scores.head(3))
+    return x, scores['label']
+
+def cross_val(x, y):
+    """
+    Run 5-fold cross-validation and report accuracy, recall, precision, F1 and AUC.
+    """
+    print("\nStarting cross-validation...")
+    pca_dim = calc_pca_dim(x)
+    kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)  # fixed seed so every metric is scored on the same folds
+    pipe = Pipeline([("scaler", StandardScaler()),
+                     ("pca", PCA(n_components=pca_dim)),
+                     ("model", RandomForestClassifier())])
+    accuracy = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="accuracy").mean()
+    recall = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="recall").mean()
+    precision = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="precision").mean()
+    F1 = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="f1").mean()
+    auc = cross_val_score(pipe, x, y, cv=kfold_cv, scoring="roc_auc").mean()
+    print(f"Accuracy: {accuracy:.3f}, recall: {recall:.3f}, precision: {precision:.3f}, F1: {F1:.3f}, AUC: {auc:.3f}")
+
+def calc_pca_dim(x):
+    """
+    Return the number of PCA components needed to reach the pca_ccr cumulative variance ratio.
+    """
+    print("\nCalculating PCA dimensions...")
+    scaler = StandardScaler()
+    x_scaled = scaler.fit_transform(x)
+    pca = PCA(n_components=None)  # keep every component so the full variance spectrum is available
+    pca.fit(x_scaled)
+    ccr = 0
+    pca_dim = pca.n_components_  # fallback: keep all components if the threshold is never reached
+    for i in range(pca.n_components_):
+        ccr += pca.explained_variance_ratio_[i]
+        if ccr >= pca_ccr:
+            pca_dim = i + 1  # i is zero-based, so add 1
+            print(f'Number of components to reach {pca_ccr:.0%} cumulative variance: {pca_dim}')
+            break
+    return pca_dim
+
+def train_predict(x, y):
+    """
+    Train the model on a train split and evaluate it on a held-out test set.
+    """
+    # Split the dataset into training and test sets
+    print("\nStarting model training and prediction...")
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
+    input_dim = x_train.shape[1]
+    print(f"Input dim:{input_dim} Train:{x_train.shape[0]}, Test: {x_test.shape[0]}")
+
+    pca_dim = calc_pca_dim(x_train)
+    pipe = Pipeline([("scaler", StandardScaler()),
+                     ("pca", PCA(n_components=pca_dim)),
+                     ("model", RandomForestClassifier())])
+
+    pipe.fit(x_train, y_train)
+    y_pred = pipe.predict(x_test)
+    # scaler = StandardScaler()
+    # x_train = scaler.fit_transform(x_train)
+    # x_test = scaler.transform(x_test)
+    # model = RandomForestClassifier()
+    # model.fit(x_train, y_train)
+    # y_pred = model.predict(x_test)
+    accuracy = accuracy_score(y_test, y_pred)
+    df.loc[df_index, 'accuracy'] = accuracy  # record only the current row (was df['accuracy'] = ..., which overwrote the whole column)
+    print(f'Accuracy: {accuracy:.2f}')
+    # print(classification_report(y_test, y_pred))
+    print('Confusion Matrix:')
+    print(confusion_matrix(y_test, y_pred))
+
+# Entry point
+if __name__ == "__main__":
+    df.loc[df_index,:] = ['all', 'Total', 0, 0, 0, 0, 0, 0]  # initialize the result row
+    x, y = make_dataset()  # select the input features and the target
+    train_predict(x, y)  # train the model and predict
+    # cross_val(x, y)  # run cross-validation (fixed arity: cross_val takes two arguments)
+    print("\nAnalysis complete.")
+    print(df)  # show the results
diff --git a/make_data.py b/make_data.py
new file mode 100644
index 0000000..066ba14
--- /dev/null
+++ b/make_data.py
@@ -0,0 +1,19 @@
+import pandas as pd
+
+target_list = ["SmTIAS_WebApp","HandyTCC_WebApp","SmTIAS_PhoneApp","HandyTCC_PhoneApp"]
+survey = pd.read_csv('survey_data_add.csv')
+
+def make_data(target):
+    app_type = target.split('_')[1]  # e.g. "WebApp" (renamed: do not shadow builtin type)
+    app_type = app_type.replace('App', '')
+    print(target)
+    deeptias = pd.read_csv(f'DeepTIAS_{app_type}.csv')
+
+    merged = pd.merge(survey, deeptias, left_on=target, right_on='name', how='inner')
+    merged.to_csv(f'Merged_{target}.csv', index=False)
+
+    print(merged.head(3))
+
+if __name__ == "__main__":
+    for target in target_list:
+        make_data(target)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7b233bd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+scikit-learn
+matplotlib