import numpy as np

import pandas as pd
# Download the fish dataset from the short-link CSV into a DataFrame,
# then preview its first five rows (shown when run as a notebook cell).
fish = pd.read_csv(filepath_or_buffer='https://bit.ly/fish_csv')

fish.head(n=5)

# List the distinct species labels; these are the target classes.
print(fish['Species'].unique())

['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']

# Use the five measurement columns as features and convert that slice of
# the DataFrame into a NumPy array (one row per fish, one column per feature).
feature_columns = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
fish_input = fish[feature_columns].to_numpy()
fish_input.shape

(159, 5)

# Show the first five feature rows (all columns).
print(fish_input[:5, :])

[[242.      25.4     30.      11.52     4.02  ]
 [290.      26.3     31.2     12.48     4.3056]
 [340.      26.5     31.1     12.3778   4.6961]
 [363.      29.      33.5     12.73     4.4555]
 [430.      29.      34.      12.444    5.134 ]]

# Convert the class-label column ('Species') into a NumPy array of targets.
fish_target = fish.loc[:, 'Species'].to_numpy()
fish_target.shape

(159,)

from sklearn.model_selection import train_test_split

# Split features and labels into train/test parts. random_state=42 fixes the
# shuffle so the split is reproducible; no test_size is passed, so the
# library default split ratio applies.
splits = train_test_split(fish_input, fish_target, random_state=42)
train_input, test_input, train_target, test_target = splits

train_input.shape

(119, 5)

# Feature scaling (bring all features onto a comparable range).
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only, so no statistics from the test
# set are used when scaling.
ss = StandardScaler()
ss.fit(X=train_input)

StandardScaler(copy=True, with_mean=True, with_std=True)

# Apply the scaler fitted on the training data to both splits, so train and
# test features are transformed with the same statistics.
train_scaled, test_scaled = map(ss.transform, (train_input, test_input))

# Logistic regression is fit by an iterative algorithm; max_iter=1000 gives it
# enough iterations to train sufficiently.
# By default LogisticRegression penalizes the squared coefficients, just like
# ridge regression -> this is called L2 regularization.
# C plays the role that alpha plays in ridge, but inverted: the smaller C is,
# the stronger the regularization (the default is 1).
# C=20 is used here to relax the regularization.
# NOTE(review): the multi-class scheme (one-vs-rest vs. multinomial) is not set
# here and is left to the library default, which differs between scikit-learn
# versions - confirm which one the outputs in this post were produced with.
from sklearn.linear_model import LogisticRegression

lr_params = {'C': 20, 'max_iter': 1000}
lr = LogisticRegression(**lr_params)
lr.fit(train_scaled, train_target)

# Check which class is predicted for each of the first five test samples.
predicted_labels = lr.predict(test_scaled[:5])
print(predicted_labels)

['Perch' 'Smelt' 'Pike' 'Perch' 'Perch']

# Per-class probabilities for the first five test samples. There are
# 5 samples and 7 classes, so this prints a 5x7 array (one row per sample,
# one column per class), not a 5x2 array as in the binary case.
class_probabilities = lr.predict_proba(test_scaled[:5])
print(class_probabilities)

[[3.22915101e-05 4.37401160e-02 6.95724508e-01 1.45386445e-03
  2.26750523e-01 1.98613907e-02 1.24373065e-02]
 [1.03941043e-06 4.71536962e-02 1.22520324e-01 3.80160397e-03
  5.07737869e-02 7.74480348e-01 1.26920164e-03]
 [4.23756249e-04 3.47384633e-06 3.91190806e-02 7.91641519e-01
  7.34093289e-02 9.29110051e-02 2.49183676e-03]
 [1.06398227e-02 1.04868641e-02 5.87049814e-01 9.89500397e-03
  3.35228324e-01 1.45209995e-03 4.52480711e-02]
 [3.57676414e-05 1.05861570e-03 6.96341736e-01 1.25434626e-02
  2.60820811e-01 1.41335046e-02 1.50661026e-02]]

# Check which class each column of the probability output above stands for.
class_labels = lr.classes_
print(class_labels)

# Column i corresponds to lr.classes_[i]; per the printed labels, index 0 is
# 'Bream' and index 5 is 'Smelt'. With seven classes there is no single
# negative/positive pair as in binary classification.

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']

# Five features are used, so coef_ has 5 columns.
# There are 7 classes, so it has 7 rows: multi-class classification computes
# one z value per class, and the class with the highest z is the prediction.
coef_shape, intercept_shape = lr.coef_.shape, lr.intercept_.shape
print(coef_shape, intercept_shape)

(7, 5) (7,)

# Get the z value of every class (z1..z7) for the first five test samples,
# printed rounded to two decimals.
decision = lr.decision_function(test_scaled[:5])
print(decision.round(2))

[[-10.23  -2.97   1.25  -6.42  -1.08  -3.79  -4.26]
 [-13.66  -2.88  -1.83  -5.45  -2.8    1.91  -6.55]
 [ -7.62 -12.43  -3.05   2.35  -2.38  -2.12  -5.85]
 [ -5.34  -5.35  -1.03  -5.41  -1.73  -7.33  -3.87]
 [-10.16  -6.77   1.1   -4.29  -0.94  -4.17  -4.1 ]]

# Use SciPy's softmax to turn the z values into probabilities; axis=1 applies
# it per sample (per row), so each row sums to 1.
# NOTE(review): the predict_proba values printed earlier in this post do not
# equal this softmax output (e.g. about 0.696 vs 0.89 for the first sample's
# top class). They do equal each class's sigmoid(z) divided by the row sum,
# which suggests those outputs came from a one-vs-rest fit. Softmax only
# reproduces predict_proba for a multinomial fit - confirm against the
# installed scikit-learn version.
from scipy.special import softmax

proba = softmax(decision, axis=1)
print(proba.round(3))

[[0.    0.013 0.89  0.    0.087 0.006 0.004]
 [0.    0.008 0.023 0.001 0.009 0.96  0.   ]
 [0.    0.    0.004 0.976 0.009 0.011 0.   ]
 [0.008 0.008 0.627 0.008 0.311 0.001 0.036]
 [0.    0.    0.873 0.004 0.113 0.004 0.005]]

하이퍼 파라미터(hyperparameter)란? (0)	2022.02.21
검증(Validation)/ 교차검증(cross validation/K-fold cross validation) (0)	2022.02.21
[분류 알고리즘] 로지스틱 회귀(Logistic Regression) - 이진분류(binary classification) (0)	2022.02.17
데이터 분석이란? (0)	2021.12.14
스칼라(scalar), 벡터(Vector), 행렬(Matrix), 텐서(Tensor)란? (0)	2021.12.11

그래도해야지어떡해

그래도해야지어떡해

[분류 알고리즘] 로지스틱 회귀(Logistic Regression) - 다중분류(multi-class classification) 본문

[분류 알고리즘] 로지스틱 회귀(Logistic Regression) - 다중분류(multi-class classification)

'ML' 카테고리의 다른 글

티스토리툴바

	Species	Weight	Length	Diagonal	Height	Width
0	Bream	242.0	25.4	30.0	11.5200	4.0200
1	Bream	290.0	26.3	31.2	12.4800	4.3056
2	Bream	340.0	26.5	31.1	12.3778	4.6961
3	Bream	363.0	29.0	33.5	12.7300	4.4555
4	Bream	430.0	29.0	34.0	12.4440	5.1340

« 2025/09 »
일	월	화	수	목	금	토
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30