本次实验基于 20 newsgroups 新闻组数据集,首次运行时会自动下载。通过 categories 参数指定类别名称可以控制所用类别的数量;设为 None 则加载全部 20 个类别。本实验选取其中 5 个类别,对应的数据集共包含 2823 个文档。
实验在四种设置下对数据集进行了训练和测试:使用全部标注数据的全监督 SGD 分类器、仅使用 20% 标注数据的 SGD 分类器、自训练(Self-Training)分类器以及标签传播(Label Spreading)分类器。以下是各设置在测试集上的 F1 分数。
在100%的数据上训练的SGD分类器,训练样本数量为2117,测试集上的微观平均F1分数为0.885。
在20%的数据上训练的SGD分类器,训练样本数量为411,测试集上的微观平均F1分数为0.773。
在20%的标注训练数据上应用自训练分类器(其余80%作为未标记数据),训练样本总数为2117,其中未标记样本1706个。经过10次迭代,分类器在测试集上的微观平均F1分数为0.834。
在20%的标注数据上应用标签传播分类器(其余为未标记数据),训练样本总数为2117,其中未标记样本1706个。测试集上的微观平均F1分数为0.644。
以下是使用Python和scikit-learn库实现上述分类器的代码示例。
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier
# Load the training subset restricted to the first five categories.
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
]
data = fetch_20newsgroups(subset="train", categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Shared hyper-parameters: one set for the SGD classifiers, one for the
# text vectorizer used by every pipeline below.
sdg_params = {"alpha": 1e-5, "penalty": "l2", "loss": "log_loss"}
vectorizer_params = {"ngram_range": (1, 2), "min_df": 5, "max_df": 0.8}
# Fully supervised baseline:
# raw text -> token counts -> tf-idf weights -> linear SGD classifier.
pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier(**sdg_params)),
    ]
)
# Same text-processing front end, but the final estimator wraps the SGD
# model in SelfTrainingClassifier so it can exploit unlabeled samples.
st_pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
    ]
)
def _densify(sparse_matrix):
    """Convert the sparse tf-idf matrix to a dense ndarray for LabelSpreading."""
    return sparse_matrix.toarray()


# Label-spreading pipeline: the tf-idf output is converted to a dense
# array before being handed to LabelSpreading.
ls_pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("toarray", FunctionTransformer(_densify)),
        ("clf", LabelSpreading()),
    ]
)
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    """Fit *clf* on the training split and print its micro-averaged F1 on the test split.

    Labels equal to -1 are treated (and reported) as unlabeled samples.
    """
    n_unlabeled = sum(label == -1 for label in y_train)
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:", n_unlabeled)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    score = f1_score(y_test, predictions, average="micro")
    print("Micro-averaged F1 score on test set: %0.3f" % score)
    print("-" * 10)
    print()
if __name__ == "__main__":
    X, y = data.data, data.target
    # Fix the seeds so the run is reproducible: the accompanying text quotes
    # exact F1 scores, which is meaningless with an unseeded RNG.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

    # Boolean mask selecting roughly 20% of the training set to keep labeled.
    rng = np.random.default_rng(42)
    y_mask = rng.random(len(y_train)) < 0.2

    # X_20 / y_20: the labeled ~20% subset indicated by the mask.
    X_20, y_20 = map(
        list, zip(*((x, t) for x, t, m in zip(X_train, y_train, y_mask) if m))
    )
    print("Supervised SGDClassifier on 20% of the training data:")
    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

    # Mark every sample outside the 20% subset as unlabeled (-1), the
    # convention expected by the semi-supervised estimators.
    y_train[~y_mask] = -1
    print("SelfTrainingClassifier on 20% of the training data (rest is unlabeled):")
    eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)

    print("LabelSpreading on 20% of the data (rest is unlabeled):")
    eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)
# 脚本总运行时间:(0分钟6.589秒)