import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
pca_ccr = 0.98 # PCA cumulative contribution (explained-variance) ratio threshold
n_folds = 5 # number of cross-validation folds
# Results table: one row per (DataType, Input, Target, Model) experiment.
df = pd.DataFrame(
columns=['DataType','Input', 'Target', 'Model',
'PCAdim', 'Accuracy', 'Recall', 'Precision', 'F1', 'AUC']
)
df_index = 0 # row cursor into df for the experiment currently being run
def make_dataset():
    """
    Build the feature matrix and binary label vector for the current run.

    Reads the merged CSV for the current DataType (from the global ``df``
    row at ``df_index``), selects the feature columns requested by the
    'Input' field, aggregates the questionnaire scores, and labels each
    sample 1 when its target score is at or above the median, else 0.

    Returns:
        tuple: (x, y) where x is the feature DataFrame and y is a 0/1
        pandas Series named 'label'.
    """
    # NOTE: double quotes for the f-string so the nested single-quoted key
    # is legal on all Python versions (nested same-quotes is a SyntaxError
    # before Python 3.12).
    all_data = pd.read_csv(f"Merged_{df.loc[df_index, 'DataType']}App.csv")
    feature_set = df.loc[df_index, 'Input']  # which feature group to use
    if feature_set == "shape":
        x = all_data.loc[:, 'shape-width':'shape-bottomRightY']
    elif feature_set == "color":
        x = all_data.loc[:, 'chiu-lateral-L-min':'fiveClick-tip-b-kurtosis']
    elif feature_set == "texture":
        x = all_data.loc[:, 'chiu-lateral-contrast':'fiveClick-tip-correlation']
    else:
        # "all": every feature column
        x = all_data.loc[:, 'shape-width':'fiveClick-tip-b-kurtosis']
    # Copy so the score aggregation below does not mutate all_data
    # (and does not trigger SettingWithCopyWarning).
    scores = all_data.loc[:, 'A01':'C08'].copy()
    for invert in ('A01', 'A02', 'A03'):
        scores[invert] = 5 - scores[invert]  # reverse-scored items
    scores['A'] = scores.loc[:, 'A01':'A10'].sum(axis=1)  # group A subtotal
    scores['B'] = scores.loc[:, 'B07':'B29'].sum(axis=1)  # group B subtotal
    scores['C'] = scores.loc[:, 'C01':'C08'].sum(axis=1)  # group C subtotal
    scores['Total'] = scores.loc[:, 'A':'C'].sum(axis=1)  # overall score
    target = df.loc[df_index, 'Target']  # which score to classify on
    threshold = scores[target].median()  # median split into two classes
    # 1 when the target score is at or above the median, else 0.
    scores['label'] = (scores[target] >= threshold).astype(int)
    return x, scores['label']
def cross_val(model, x, y):
    """
    Cross-validate ``model`` on (x, y) and record metrics in the global df.

    Builds a StandardScaler -> PCA -> model pipeline (PCA dimensionality
    chosen by calc_pca_dim) and writes the mean Accuracy, Recall,
    Precision, F1 and AUC over n_folds folds into the row at df_index.
    """
    pca_dim = calc_pca_dim(x)
    df.loc[df_index, 'PCAdim'] = pca_dim  # record chosen PCA dimensionality
    # Fixed random_state so every metric below is evaluated on the SAME
    # folds; without it each cross_val_score call reshuffles and the five
    # metrics would be computed on different splits.
    kfold_cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    pipe = Pipeline([("scaler", StandardScaler()),
                     ("pca", PCA(n_components=pca_dim)),
                     ("model", model)])
    # df column name -> sklearn scoring string.
    scorings = {'Accuracy': 'accuracy',
                'Recall': 'recall',
                'Precision': 'precision',
                'F1': 'f1',
                'AUC': 'roc_auc'}
    for column, scoring in scorings.items():
        df.loc[df_index, column] = cross_val_score(
            pipe, x, y, cv=kfold_cv, scoring=scoring).mean()
def calc_pca_dim(x):
    """
    Return the number of principal components needed to reach pca_ccr.

    Standardizes x (matching the evaluation pipeline), fits a full PCA,
    and counts components until the cumulative explained-variance ratio
    reaches the global threshold ``pca_ccr``.

    Returns:
        int: component count (>= 1). Falls back to the full
        dimensionality if the threshold is never reached (e.g. due to
        floating-point rounding when pca_ccr == 1.0), so the result is
        never 0.
    """
    x_scaled = StandardScaler().fit_transform(x)
    pca = PCA(n_components=None)  # keep all components
    pca.fit(x_scaled)
    cumulative = 0.0
    for i, ratio in enumerate(pca.explained_variance_ratio_):
        cumulative += ratio
        if cumulative >= pca_ccr:
            return i + 1  # component count is 1-based
    # Threshold never reached: use every component rather than return 0,
    # which would crash PCA(n_components=0) downstream.
    return pca.n_components_
def train_predict(x, y):
    """
    Train a RandomForest on a single 80/20 split and report test metrics.

    Alternative to cross_val: one hold-out evaluation. Records the test
    accuracy in the 'Accuracy' column of the global df row at df_index
    and prints the confusion matrix.
    """
    print("\nStarting model training and prediction...")
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)
    input_dim = x_train.shape[1]
    print(f"Input dim:{input_dim} Train:{x_train.shape[0]}, Test: {x_test.shape[0]}")
    # PCA dimensionality chosen on the training split only (no leakage).
    pca_dim = calc_pca_dim(x_train)
    pipe = Pipeline([("scaler", StandardScaler()),
                     ("pca", PCA(n_components=pca_dim)),
                     ("model", RandomForestClassifier())])
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Record in the current experiment's row under the existing 'Accuracy'
    # column (the old `df['accuracy'] = accuracy` created a spurious
    # lowercase column and overwrote it for EVERY row).
    df.loc[df_index, 'Accuracy'] = accuracy
    print(f'Accuracy: {accuracy:.2f}')
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
# Main entry point: run every (dataset, feature set, target, model)
# combination, cross-validate each, and dump the results table to CSV.
if __name__ == "__main__":
    print("Starting analysis...")
    data_type_list = ["SmTIAS_Phone", "SmTIAS_Web", "HandyTCC_Phone", "HandyTCC_Web"]  # dataset variants
    input_list = ["shape", "color", "texture", "all"]  # feature groups
    target_list = ["A", "B", "C", "Total"]  # target scores
    model_list = [RandomForestClassifier(),
                  AdaBoostClassifier(),
                  SVC(),]  # classifiers to compare
    counts = len(data_type_list) * len(input_list) * len(target_list) * len(model_list)
    for data_type in data_type_list:
        # `input_type` (not `input`) to avoid shadowing the builtin.
        for input_type in input_list:
            for target in target_list:
                for model in model_list:
                    # Drop the "Classifier" suffix for a compact model name.
                    model_name = model.__class__.__name__.replace("Classifier", "")
                    print(f"\nAnalyzing {df_index+1}/{counts} {data_type}, {input_type}, {target}, {model_name}")
                    # Initialize the result row for this experiment.
                    df.loc[df_index,:] = [data_type, input_type, target, model_name, 0, 0, 0, 0, 0, 0]
                    x, y = make_dataset()  # features and labels for this run
                    # train_predict(x, y)  # optional single hold-out evaluation
                    cross_val(model, x, y)  # run cross-validation
                    df_index += 1
    print("\nAnalysis complete.")
    print(df)  # show the results table
    df.to_csv('analysis_results.csv', index=False)  # persist results