编码与哑变量

发布时间 2023-04-08 21:12:58 作者: ThankCAT

处理缺失值

import pandas as pd
import numpy as np
# Load the dataset, using the first CSV column as the row index.
df = pd.read_csv("./Narrativedata.csv", index_col=0)
# Print dtypes and non-null counts per column to see where values are missing.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# Fill missing ages with the column mean. The result is assigned back instead
# of calling fillna(inplace=True) on the df["Age"] selection: that chained
# in-place form warns in recent pandas 2.x and does not update df at all once
# copy-on-write is enabled.
df["Age"] = df["Age"].fillna(df["Age"].mean())
# Drop the rows that still contain a missing value (Embarked has two).
# The surviving rows keep their original index labels, so the index now has gaps.
df.dropna(axis=0, inplace=True)
df.shape
(889, 4)

preprocessing.LabelEncoder:标签专用,能够将分类转换为分类数值

from sklearn.preprocessing import LabelEncoder

# The target is the last column of df (Survived).
label = df.iloc[:, -1]
# Learn the label vocabulary first, then map every label to its integer code.
le = LabelEncoder().fit(label)
le_data = le.transform(label)
# Overwrite the string labels with their codes; the frame's shape is unchanged.
df["Survived"] = le_data
df.shape
(889, 4)
# Map the integer codes back to the original labels and preview the first rows.
pd.DataFrame(le.inverse_transform(le_data)).head()
0
0 No
1 Yes
2 Yes
3 Yes
4 No
# Labels learned during fit; a label's position in this array is its integer code.
le.classes_
array(['No', 'Unknown', 'Yes'], dtype=object)

preprocessing.OrdinalEncoder:特征专用,能够将分类特征转换为分类数值

from sklearn.preprocessing import OrdinalEncoder

# Work on an independent copy so df keeps its original string categories.
df_ = df.copy()
# Categorical feature columns only: everything between the first and last column.
feature = df_.iloc[:, 1:-1]
# Learn the categories of each column, then encode them as ordinal numbers.
ordinal = OrdinalEncoder(categories="auto").fit(feature)
result = ordinal.transform(feature)
# One array of learned categories per encoded column.
ordinal.categories_
[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]
# Column names of the frame the encoder was fitted on.
ordinal.feature_names_in_
array(['Sex', 'Embarked'], dtype=object)
# df is unchanged here: the encoder was fitted on a slice of the copy df_,
# and the encoded values live only in result.
df.shape
(889, 4)

preprocessing.OneHotEncoder:独热编码,创建哑变量

from sklearn.preprocessing import OneHotEncoder

# Categorical feature columns: everything between the first and last column of df.
categorical = df.iloc[:, 1:-1]
onehot = OneHotEncoder(categories="auto")
# Fit and encode in one step; the encoder returns a sparse matrix, so convert
# it to a dense array for display and for later concatenation.
result = onehot.fit_transform(categorical).toarray()
result
array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])
# Names of the generated dummy columns, in the column order of result.
onehot.get_feature_names_out()
array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype=object)
# Wrap the dense one-hot array in a DataFrame to check its shape:
# one row per row of df, one column per category.
features = pd.DataFrame(result)
features.shape
(889, 5)
# Build the one-hot frame on df's own index before concatenating.
# After dropna, df still carries its original row labels (889 rows labelled
# 0..890 with gaps), whereas pd.DataFrame(result) alone would get a fresh
# 0..888 index. concat(axis=1) aligns rows by label, so the mismatch attached
# encodings to the wrong rows and produced NaN rows; the dropna(axis=0) call
# that used to follow only hid this by discarding rows.
data_full = pd.concat([df, pd.DataFrame(result, index=df.index)], axis=1)
# NOTE: the printed output shown below was recorded before this fix
# (887 rows, Survived upcast to float64 by the NaN rows). With aligned
# indexes no NaN is introduced and all 889 rows are kept.
data_full.head()
Age Sex Embarked Survived 0 1 2 3 4
0 22.0 male S 0.0 0.0 1.0 0.0 0.0 1.0
1 38.0 female C 2.0 1.0 0.0 1.0 0.0 0.0
2 26.0 female S 2.0 1.0 0.0 0.0 0.0 1.0
3 35.0 female S 2.0 1.0 0.0 0.0 0.0 1.0
4 35.0 male S 0.0 0.0 1.0 0.0 0.0 1.0
# The raw string columns are now represented by their dummy columns; remove them.
data_full.drop(columns=["Sex", "Embarked"], inplace=True)
# Replace the positional 0..4 labels of the dummy columns with readable names.
data_full.columns = ["Age", "Survived"] + [
    'Sex_female', 'Sex_male',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]
# Confirm the final layout: seven numeric columns and no missing values.
data_full.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 887 entries, 0 to 888
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         887 non-null    float64
 1   Survived    887 non-null    float64
 2   Sex_female  887 non-null    float64
 3   Sex_male    887 non-null    float64
 4   Embarked_C  887 non-null    float64
 5   Embarked_Q  887 non-null    float64
 6   Embarked_S  887 non-null    float64
dtypes: float64(7)
memory usage: 55.4 KB