import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer
import torchtext
from torchtext.vocab import build_vocab_from_iterator
# 使用torchtext加载IMDB数据集
from torchtext.datasets import IMDB
# Rule-based English tokenizer from torchtext ('basic_english':
# lowercases and splits on whitespace/punctuation).
tokenizer = get_tokenizer('basic_english')

# Special tokens reserved at the front of the vocabulary:
# unknown, padding, begin-of-sequence, end-of-sequence.
specials = ["<unk>", "<pad>", "<bos>", "<eos>"]
def yield_tokens(sentences, tok=None):
    """Lazily tokenize an iterable of raw sentences.

    Args:
        sentences: iterable of text strings.
        tok: optional callable mapping a string to a list of tokens.
            Defaults to the module-level ``tokenizer`` for backward
            compatibility with existing callers.

    Yields:
        list[str]: token list for each sentence, in order.
    """
    if tok is None:
        # Late-bind the module-level basic_english tokenizer so the
        # function also works when called before/without torchtext setup
        # as long as an explicit `tok` is passed.
        tok = tokenizer
    for sentence in sentences:
        yield tok(sentence)
# Download and load the IMDB dataset (train/test iterable pipes).
train_iter, test_iter = IMDB(split=('train', 'test'))
# Build the vocabulary from the tokenized training texts.
# NOTE(review): in modern torchtext, IMDB yields (label, text) pairs, so
# unpacking as (text, label) here may swap the fields — confirm against
# the installed torchtext version.
# NOTE(review): this comprehension exhausts the one-shot train_iter; any
# later iteration over train_iter will see no data unless the iterator
# is recreated via IMDB(split='train').
vocab = build_vocab_from_iterator(yield_tokens([text for (text, label) in train_iter]), specials=specials, min_freq=1)
# Out-of-vocabulary tokens resolve to the <unk> index.
vocab.set_default_index(vocab["<unk>"])
# Load pretrained 100-dimensional GloVe vectors (6B-token corpus).
glove = GloVe(name='6B', dim=100)
# NOTE(review): set_vectors is a legacy torchtext Vocab method; the Vocab
# returned by build_vocab_from_iterator may not provide it — verify, or
# build an embedding matrix manually via glove.get_vecs_by_tokens.
vocab.set_vectors(glove.stoi, glove.vectors, glove.dim)
def text_to_indices(texts, vocab, tokenizer):
    """Convert raw texts into lists of integer token indices.

    Args:
        texts: iterable of raw text strings.
        vocab: mapping from token to integer index; must support
            ``in`` and ``[]`` and contain an "<unk>" entry.
        tokenizer: callable that splits a string into tokens.

    Returns:
        list[list[int]]: one index list per input text; tokens missing
        from `vocab` map to the "<unk>" index.
    """
    # Hoist the fallback lookup out of the per-token loop.
    unk = vocab["<unk>"]
    return [
        [vocab[token] if token in vocab else unk for token in tokenizer(text)]
        for text in texts
    ]
# Extract texts and labels from the training set.
# NOTE(review): train_iter is a one-shot iterator that was already
# consumed when the vocabulary was built above, so this zip will see an
# empty stream (zip(*[]) raises/returns nothing useful) — recreate the
# iterator with IMDB(split='train') before this line. Also confirm the
# (text, label) vs (label, text) tuple order for the installed torchtext.
train_sentences, train_labels = zip(*[(text, label) for text, label in train_iter])
test_sentences,
# Tags: vocab, tokenizer, text, torchtext, Long, sentiment, train, movie reviews, import
# From: https://blog.csdn.net/max500600/article/details/145006133