
LSTM + Transformer Machine Translation: Chinese-Tibetan Translation (Complete Code and Data)

Posted: 2024-10-26 11:16:51

Project video walkthrough:

基于lstm+taransforner机器翻译-中藏翻译_哔哩哔哩_bilibili

Data preview: each line of 测试集.txt pairs a Chinese source sentence with its Tibetan translation, the two separated by '%%'.
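A minimal sketch of the assumed line format (the '%%' separator and the trailing two-character trim are taken from the loader below; the Tibetan text is only a placeholder):

# One line of 测试集.txt is assumed to look like:
#     <Chinese source sentence>%%<Tibetan translation><two trailing characters>
sample_line = '在 中国共产党 第二十次 全国代表大会 上 的 报告%%<Tibetan translation>xx'
parts = sample_line.split('%%')
print(parts[0])        # Chinese source sentence
print(parts[1][:-2])   # Tibetan target, with the last two characters dropped as in the loader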
# coding:utf-8
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example data (repeated to increase the sample size)
# data = [ ... ]  # original inline example data, commented out

# Extract the source-language and target-language sentences
data = []
with open('测试集.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()
for line in lines:
    parts = line.split('%%')
    # parts[0]: Chinese source; parts[1]: Tibetan target (the last two characters are trimmed)
    data.append([parts[0], parts[1][:-2]])
print(data)
source_texts = [pair[0] for pair in data]
target_texts = [pair[1] for pair in data]

# Split into training and test sets (80/20)
source_train, source_test, target_train, target_test = train_test_split(source_texts, target_texts, test_size=0.2, random_state=42)

# Tokenize the source language
source_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(source_train)
source_train_seq = source_tokenizer.texts_to_sequences(source_train)
source_test_seq = source_tokenizer.texts_to_sequences(source_test)

# Tokenize the target language
target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(target_train)
target_train_seq = target_tokenizer.texts_to_sequences(target_train)
target_test_seq = target_tokenizer.texts_to_sequences(target_test)

# Pad sequences to fixed lengths (post-padding with 0, the index the Tokenizer reserves for padding)
max_source_length = max(len(seq) for seq in source_train_seq)
max_target_length = max(len(seq) for seq in target_train_seq)
source_train_seq = pad_sequences(source_train_seq, maxlen=max_source_length, padding='post')
print('source_train_seq.shape', source_train_seq.shape)
source_test_seq = pad_sequences(source_test_seq, maxlen=max_source_length, padding='post')
print('source_test_seq.shape', source_test_seq.shape)
target_train_seq = pad_sequences(target_train_seq, maxlen=max_target_length, padding='post')
target_test_seq = pad_sequences(target_test_seq, maxlen=max_target_length, padding='post')

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional, LayerNormalization, MultiHeadAttention

# Model inputs
encoder_inputs = Input(shape=(max_source_length,), name='encoder_inputs')
decoder_inputs = Input(shape=(max_target_length,), name='decoder_inputs')
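# Note: both inputs are fixed-length sequences of integer token ids; the decoder
# input is the target sequence arranged for teacher forcing (prepared further below).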

# Encoder: bidirectional LSTM
embedding_dim = 512
encoder_embedding = Embedding(input_dim=len(source_tokenizer.word_index) + 1, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(embedding_dim, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
encoder_states = [forward_h, forward_c, backward_h, backward_c]
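# Note: Bidirectional(LSTM(..., return_state=True)) returns the merged sequence output
# followed by [forward_h, forward_c, backward_h, backward_c]; with the default
# merge_mode='concat', encoder_outputs has shape (batch, max_source_length, 2 * embedding_dim).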

# Decoder: unidirectional LSTM
decoder_embedding = Embedding(input_dim=len(target_tokenizer.word_index) + 1, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_states[0], encoder_states[2]])
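# Note: LSTM initial_state expects [h, c]; as written, the decoder is seeded with
# forward_h as its hidden state and backward_h as its cell state, while forward_c
# and backward_c go unused.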

# Transformer-style cross-attention over the encoder outputs
attention_layer = MultiHeadAttention(num_heads=4, key_dim=embedding_dim)
attention_output = attention_layer(query=decoder_outputs, key=encoder_outputs, value=encoder_outputs)
attention_output = LayerNormalization(epsilon=1e-6)(attention_output + decoder_outputs)
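# Note: this is cross-attention (decoder states as queries, encoder outputs as keys/values).
# MultiHeadAttention's output dimension defaults to the query's last dimension, so the
# residual addition with decoder_outputs and the LayerNormalization are shape-compatible.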

# Time-distributed softmax over the target vocabulary
decoder_dense = TimeDistributed(Dense(len(target_tokenizer.word_index) + 1, activation='softmax'))
decoder_outputs = decoder_dense(attention_output)

# Build the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Prepare decoder input/target pairs (teacher forcing: the target is the input shifted left by one step)
target_train_seq_input = np.zeros((len(target_train_seq), max_target_length))
target_train_seq_output = np.zeros((len(target_train_seq), max_target_length))

target_train_seq_input[:, 0:max_target_length-1] = target_train_seq[:, 0:max_target_length-1]  # decoder input
target_train_seq_output[:, 0:max_target_length-1] = target_train_seq[:, 1:max_target_length]  # decoder target

target_test_seq_input = np.zeros((len(target_test_seq), max_target_length))
target_test_seq_output = np.zeros((len(target_test_seq), max_target_length))

target_test_seq_input[:, 0:max_target_length-1] = target_test_seq[:, 0:max_target_length-1]
target_test_seq_output[:, 0:max_target_length-1] = target_test_seq[:, 1:max_target_length]
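# For example, a padded target row [5, 9, 2, 7] becomes
#     decoder input : [5, 9, 2, 0]   (first max_target_length-1 tokens, last slot left as padding)
#     decoder target: [9, 2, 7, 0]   (the same row shifted left by one)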

# Add a trailing axis so the targets match the shape expected by sparse_categorical_crossentropy
target_train_seq_output = np.expand_dims(target_train_seq_output, -1)
target_test_seq_output = np.expand_dims(target_test_seq_output, -1)
print(target_train_seq_output.shape)

print('source_train_seq',source_train_seq)
print('target_train_seq_input',target_train_seq_input)
# print('target_train_seq_output',target_train_seq_output)

# Train the model
history = model.fit(
    [source_train_seq, target_train_seq_input],
    target_train_seq_output,
    validation_data=([source_test_seq, target_test_seq_input], target_test_seq_output),
    batch_size=32,
    epochs=100
)

# Save the trained model (HDF5 format)
model.save('translation_model.h5')
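A hedged aside: model.save() stores only the network, not the two Tokenizer objects that translate_sentence() below depends on, so a fresh inference session would also need them persisted. One option is to pickle them alongside the model (the file name tokenizers.pkl is only illustrative):

import pickle

# Persist both tokenizers so the vocabulary mappings can be restored later
with open('tokenizers.pkl', 'wb') as fp:
    pickle.dump({'source': source_tokenizer, 'target': target_tokenizer}, fp)
# Later: tokenizers = pickle.load(open('tokenizers.pkl', 'rb'))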

# Load the saved model for inference
loaded_model = tf.keras.models.load_model('translation_model.h5', custom_objects={'MultiHeadAttention': MultiHeadAttention, 'LayerNormalization': LayerNormalization})

def translate_sentence(input_sentence, source_tokenizer, target_tokenizer, model, max_source_length, max_target_length):
    # Tokenize and pad the input sentence
    input_seq = source_tokenizer.texts_to_sequences([input_sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_source_length, padding='post')

    # Prepare the decoder input (all zeros, i.e. all padding, to start with)
    target_seq = np.zeros((1, max_target_length))

    # Greedy decoding: at each step, re-run the model and take the most likely token at position i
    translated_sentence = ''
    for i in range(max_target_length - 1):
        output_tokens = model.predict([input_seq, target_seq])
        sampled_token_index = np.argmax(output_tokens[0, i, :])
        # Index 0 is the padding index; nudge it to 1 at the first step so decoding does not stall
        if sampled_token_index == 0 and i == 0:
            sampled_token_index += 1
        sampled_token = target_tokenizer.index_word.get(sampled_token_index, '')
        translated_sentence += ' ' + sampled_token

        # Feed the chosen token back in as the next decoder input
        target_seq[0, i + 1] = sampled_token_index
    return translated_sentence.strip()

# Evaluate on the test set
test_loss, test_accuracy = loaded_model.evaluate([source_test_seq, target_test_seq_input], target_test_seq_output)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

# Example inference
input_sentence = "在 中国共产党 第二十次 全国代表大会 上 的 报告"
translated_sentence = translate_sentence(input_sentence, source_tokenizer, target_tokenizer, loaded_model, max_source_length, max_target_length)
print(f'Input Sentence: {input_sentence}')
print(f'Translated Sentence: {translated_sentence}')
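For a quick qualitative check, the same helper can be looped over a few held-out sentences (a small usage sketch built only on variables already defined above):

# Translate the first few test sentences and print them next to their references
for src, ref in list(zip(source_test, target_test))[:3]:
    hyp = translate_sentence(src, source_tokenizer, target_tokenizer, loaded_model, max_source_length, max_target_length)
    print('SRC:', src)
    print('REF:', ref)
    print('HYP:', hyp)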

Complete code and data:

https://download.csdn.net/download/qiqi_ai_/89923908

Tags: target, seq, max, transformer, machine translation, source, train, test, lstm
From: https://blog.csdn.net/qq_38735017/article/details/143250246
