Summary of AI Examples: Common AI Algorithms and Examples
Implementing a Chinese sentiment analysis algorithm in PyTorch typically involves the following steps: data preprocessing, model definition, training, and evaluation. Below is a simple example that uses an LSTM model for Chinese sentiment analysis.
1. Data Preprocessing
First, we need to segment the Chinese text into words and convert it into numeric form (such as word vectors). We can use jieba for word segmentation, and torchtext or a custom vocabulary to map tokens to indices.
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import jieba
# Sample data: (text, label) pairs
data = [
    ("我非常喜欢这个电影", "positive"),
    ("这个电影太糟糕了", "negative"),
    ("这部电影真的很棒", "positive"),
    ("我不喜欢这个电影", "negative"),
    ("这部电影让我感动", "positive"),
    ("这部电影太无聊了", "negative"),
    ("演员表演非常出色", "positive"),
    ("剧情太差了", "negative"),
    ("画面非常精美", "positive"),
    ("完全不值得看", "negative")
]

# Tokenizer: use jieba for Chinese word segmentation
def tokenize(text):
    return list(jieba.cut(text))

# Build the vocabulary (get_tokenizer passes a custom callable through unchanged)
tokenizer = get_tokenizer(tokenize)
vocab = build_vocab_from_iterator(map(tokenizer, [text for text, label in data]), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Convert a text into a list of token indices
def text_to_indices(text):
    return [vocab[token] for token in tokenizer(text)]

# Map labels to numeric values
label_to_index = {"positive": 1, "negative": 0}

# Preprocess the dataset
processed_data = [(text_to_indices(text), label_to_index[label]) for text, label in data]
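
# Illustrative sanity check (an addition, not from the original post): inspect
# what the preprocessing produces. Exact indices depend on the vocabulary
# built above, so the values in the comments are only indicative.
print(tokenize("我非常喜欢这个电影"))  # jieba segmentation, e.g. ['我', '非常', '喜欢', '这个', '电影']
print(processed_data[0])  # (list of token indices, 1) -- 1 means "positive"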
# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: [sequence_length, batch_size]
        embedded = self.dropout(self.embedding(text))  # [sequence_length, batch_size, embedding_dim]
        output, (hidden, cell) = self.lstm(embedded)
        # Concatenate the final forward and backward hidden states of the top layer
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))  # [batch_size, hidden_dim * 2]
        return self.fc(hidden)  # [batch_size, output_dim]
# Hyperparameters
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Initialize the model
model = LSTMModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
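
# Illustrative shape check (an addition, not from the original post): pass one
# preprocessed sample through the untrained model to confirm the expected
# output shape of [batch_size, OUTPUT_DIM].
sample = torch.tensor(processed_data[0][0]).unsqueeze(1)  # [sequence_length, batch_size=1]
print(model(sample).shape)  # expected: torch.Size([1, 1])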
# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())
# Training function
def train(model, data, optimizer, criterion, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for text, label in data:
            text = torch.tensor(text).unsqueeze(1)              # [sequence_length, batch_size=1]
            label = torch.tensor([label], dtype=torch.float32)  # [batch_size=1]
            optimizer.zero_grad()
            predictions = model(text).squeeze(0)  # [1] -- a single logit
            loss = criterion(predictions, label)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch: {epoch + 1}, Loss: {total_loss / len(data)}')

# Train the model
train(model, processed_data, optimizer, criterion, epochs=20)
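
# Evaluation sketch (an addition, not from the original post): the introduction
# mentions evaluation, so here is a simple accuracy function. With only ten
# examples there is no held-out set; evaluating on the training data merely
# confirms the model has fit it.
def evaluate(model, data):
    model.eval()
    correct = 0
    with torch.no_grad():
        for text, label in data:
            text = torch.tensor(text).unsqueeze(1)  # [sequence_length, batch_size=1]
            prediction = torch.sigmoid(model(text).squeeze(0))
            correct += int((prediction.item() > 0.5) == bool(label))
    return correct / len(data)

print(f'Training accuracy: {evaluate(model, processed_data):.2f}')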
# Prediction function
def predict_sentiment(model, sentence):
    model.eval()
    with torch.no_grad():
        text = torch.tensor(text_to_indices(sentence)).unsqueeze(1)  # [sequence_length, batch_size=1]
        prediction = torch.sigmoid(model(text).squeeze(0))  # [1]
        return "positive" if prediction.item() > 0.5 else "negative"

# Test the model
test_sentences = [
    "这个电影真的很棒",
    "这部电影太无聊了",
    "演员表演非常出色",
    "完全不值得看"
]
for sentence in test_sentences:
    print(f'Sentence: {sentence}, Predicted sentiment: {predict_sentiment(model, sentence)}')
- Data preprocessing:
  - Use jieba to segment the Chinese text into words.
  - Use torchtext to build a vocabulary and convert text to indices.
  - Convert labels to numeric values (positive = 1, negative = 0).
- Model definition:
  - Use an LSTM model for sentiment analysis.
  - The model consists of an embedding layer, an LSTM layer, and a fully connected layer.
- Training:
  - Use binary cross-entropy loss (BCEWithLogitsLoss) and the Adam optimizer.
  - Train the model for 20 epochs.
- Prediction:
  - Use the trained model to predict the sentiment of new sentences.
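
The training loop above processes one sentence at a time because the examples have different lengths. As a minimal sketch of how mini-batching could be added (an extension, not part of the original example; it reuses the <unk> index as padding purely for illustration, where a dedicated <pad> token would normally be preferred):

from torch.nn.utils.rnn import pad_sequence

# Pad a batch of variable-length index lists to [max_sequence_length, batch_size]
def collate_batch(batch):
    texts = [torch.tensor(indices) for indices, _ in batch]
    labels = torch.tensor([label for _, label in batch], dtype=torch.float32)
    padded = pad_sequence(texts, padding_value=vocab["<unk>"])  # batch_first=False by default
    return padded, labels

texts, labels = collate_batch(processed_data[:4])
logits = model(texts).squeeze(1)  # [batch_size]
loss = criterion(logits, labels)

Note that the LSTM here still reads the padding positions; for longer sequences, torch.nn.utils.rnn.pack_padded_sequence is typically used so that padded steps are skipped.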