1 minute read

F1 스코어

F1 스코어는 정밀도와 재현율을 결합한 지표(두 값의 조화 평균)로, 정밀도와 재현율이 어느 한쪽으로 치우치지 않는 수치를 나타낼 때 상대적으로 높은 값을 가진다.

$F1 = \frac{2}{\frac{1}{recall} + \frac{1}{precision}} = 2 \times \frac{precision \times recall}{precision + recall}$

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer

from sklearn.metrics import f1_score

# Load the breast cancer dataset shipped with scikit-learn and hold out 20%
# of the samples for testing; random_state pins the split.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=120,
)

# Train a logistic regression with default settings (fit returns the
# estimator itself) and predict labels for the held-out samples.
lr_clf = LogisticRegression().fit(X_train, y_train)
pred = lr_clf.predict(X_test)

# Report the F1 score of the predictions, rounded to four decimals.
f1 = f1_score(y_test, pred)
print('F1 스코어: {0:.4f}'.format(f1))
F1 스코어: 0.9583
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Print evaluation metrics for each candidate decision threshold.

    For every value in ``thresholds`` the probabilities are binarized with
    ``Binarizer`` at that threshold and the resulting labels are passed to
    ``get_clf_eval`` together with ``y_test``. Nothing is returned.

    Args:
        y_test: Ground-truth labels.
        pred_proba_c1: Predicted probabilities to threshold. The caller in
            this file passes the positive-class column reshaped with
            ``reshape(-1, 1)``.
        thresholds: Iterable of threshold values to evaluate, in order.
    """
    # Walk through the thresholds one by one and evaluate each in turn.
    for cutoff in thresholds:
        thresholder = Binarizer(threshold=cutoff)
        thresholder.fit(pred_proba_c1)
        thresholded_pred = thresholder.transform(pred_proba_c1)

        print('임계값:', cutoff)
        get_clf_eval(y_test, thresholded_pred)
    
def get_clf_eval(y_test, pred):
    """Print the confusion matrix and summary classification metrics.

    Computes the confusion matrix, accuracy, precision, recall and F1 score
    of ``pred`` against ``y_test`` using the scikit-learn metric functions,
    then prints them. Nothing is returned.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels to compare with ``y_test``.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    # F1 score, reported alongside the other metrics.
    f1 = f1_score(y_test, pred)

    print('오차 행렬')
    print(confusion)
    # The message is split with adjacent string literals rather than a
    # backslash inside the literal: a backslash continuation within a string
    # keeps the next line's leading indentation as part of the text, which
    # printed a run of stray spaces before "F1:".
    print(
        '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, '
        'F1:{3:.4f}'.format(accuracy, precision, recall, f1)
    )

# Candidate decision thresholds to compare.
thresholds = [0.2, 0.35, 0.5, 0.65, 0.8]

# Class probabilities for the held-out samples; column 1 is sliced out and
# reshaped into a single column before being handed to the evaluation helper.
pred_proba = lr_clf.predict_proba(X_test)
pred_proba_c1 = pred_proba[:, 1].reshape(-1, 1)

get_eval_by_threshold(y_test, pred_proba_c1, thresholds)
임계값: 0.2
오차 행렬
[[37  6]
 [ 1 70]]
정확도: 0.9386, 정밀도: 0.9211, 재현율: 0.9859,           F1:0.9524
임계값: 0.35
오차 행렬
[[37  6]
 [ 2 69]]
정확도: 0.9298, 정밀도: 0.9200, 재현율: 0.9718,           F1:0.9452
임계값: 0.5
오차 행렬
[[39  4]
 [ 2 69]]
정확도: 0.9474, 정밀도: 0.9452, 재현율: 0.9718,           F1:0.9583
임계값: 0.65
오차 행렬
[[39  4]
 [ 3 68]]
정확도: 0.9386, 정밀도: 0.9444, 재현율: 0.9577,           F1:0.9510
임계값: 0.8
오차 행렬
[[40  3]
 [ 7 64]]
정확도: 0.9123, 정밀도: 0.9552, 재현율: 0.9014,           F1:0.9275

Comments