"""Explore the UCI Wine dataset and split it into stratified train/test sets.

The script downloads ``wine.data``, prints a preview and descriptive
statistics, performs a 70/30 stratified split, and reports how many samples
of each class ended up in each partition.
"""

import numpy as np
import pandas as pd
from sklearn.datasets import load_wine  # kept from the original; not used in the visible code
from sklearn.model_selection import train_test_split


def _class_distribution(unique_counts):
    """Map each class label to its sample count using plain Python ints.

    ``unique_counts`` is the ``(labels, counts)`` pair returned by
    ``np.unique(..., return_counts=True)``.  Going through ``tolist()`` keeps
    the printed dict readable on NumPy 2, where NumPy scalars repr as
    ``np.int64(1)`` instead of ``1``.
    """
    labels, counts = unique_counts
    return dict(zip(labels.tolist(), counts.tolist()))


# The file is read with header=None, so names are supplied here: the first
# column is the class label and the remaining 13 are the features.
column_names = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=column_names,
)

# --- Data preview ---------------------------------------------------------
print("=== 数据预览 ===")
print("前5行数据:")
print(df.head())
print("\n后5行数据:")
print(df.tail())
print(f"\n数据集形状:{df.shape}")
print(f"类别标签:{np.unique(df['Class label'])}")

# --- Descriptive statistics -----------------------------------------------
print("\n=== 统计分析 ===")
print("整体统计描述:")
print(df.describe())

print("\n按类别统计描述:")
print(df.groupby('Class label').describe())

# --- Train/test split -----------------------------------------------------
# Column 0 is the target; every other column is a feature.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# stratify=y keeps the class proportions of y in both partitions;
# random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

print("\n=== 数据集划分结果 ===")
print(f"训练集大小:{X_train.shape}")
print(f"测试集大小:{X_test.shape}")

# (labels, counts) pairs for each partition.
train_counts = np.unique(y_train, return_counts=True)
test_counts = np.unique(y_test, return_counts=True)
print(f"训练集类别分布:{_class_distribution(train_counts)}")
print(f"测试集类别分布:{_class_distribution(test_counts)}")