import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
data = pd.read_csv('Datasets/StudentScore.csv')
target = 'MathScore'
x = data.drop(data[[target, 'Unnamed: 0']], axis=1)
y = data[target]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# numerical processing
num_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
x_train[['ReadingScore', 'WritingScore']] = num_transformer.fit_transform(x_train[['ReadingScore', 'WritingScore']])
x_test[['ReadingScore', 'WritingScore']] = num_transformer.transform(x_test[['ReadingScore', 'WritingScore']])
# ordinal processing
education_levels = ["high school", "some high school", "some college", "associate's degree", "bachelor's degree",
"master's degree"]
ord_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OrdinalEncoder(categories=[education_levels])),
])
x_train[['ParentEduc']] = ord_transformer.fit_transform(x_train[['ParentEduc']])
x_test[['ParentEduc']] = ord_transformer.transform(x_test[['ParentEduc']])
# nominal processing
nom_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder())
])
x_train[['EthnicGroup']] = nom_transformer.fit_transform(x_train[['EthnicGroup']])
x_test[['EthnicGroup']] = nom_transformer.transform(x_test[['EthnicGroup']])
# boolean processing
bool_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(sparse_output=False)),
])
x_train[['Gender', 'LunchType', 'TestPrep']] = bool_transformer.fit_transform(
x_train[['Gender', 'LunchType', 'TestPrep']])
x_test[['Gender', 'LunchType', 'TestPrep']] = bool_transformer.transform(x_train[['Gender', 'LunchType', 'TestPrep']])
我在尝试创建管道来处理布尔分类特征时遇到错误。具体来说,在训练和测试集中的特征的
fit_transform
步骤中,我收到了
ValueError:列必须与标称处理和布尔处理部分中的键长度相同
如何解决这个问题?
错误信息“ValueError: 列必须与标称处理和布尔处理部分中的键长度相同”表明在
fit_transform
和
transform
的训练数据和测试数据之间传递了不同数量的列。
在的代码中,问题出现在这些行中:
x_test[['Gender', 'LunchType', 'TestPrep']] = bool_transformer.transform(x_train[['Gender', 'LunchType', 'TestPrep']])
错误地将
x_train
传递给
transform
方法,而不是
x_test
。这会导致列不匹配,因为
transform
方法期望看到与它在
fit_transform
期间看到的数据集相同的列数。
要解决此问题,请更正代码中的错字,以便它将
x_test
传递给
transform
方法:
x_test[['Gender', 'LunchType', 'TestPrep']] = bool_transformer.transform(x_test[['Gender', 'LunchType', 'TestPrep']])
通过此更改,的代码应在训练和测试数据上使用正确数量的列运行,而不会出现任何错误。
标签:python,machine-learning,scikit-learn From: 78813351