加载数据

以IMDB影评数据为例,加载评论

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow import data
import warnings

# Silence library deprecation/user warnings so the training logs stay readable.
warnings.filterwarnings('ignore')

读取影评数据

# Load the labeled IMDB training reviews and the raw test reviews (TSV files).
# FIX: the original used sep=r'\t' — a raw string of the two characters
# backslash + t, which pandas treats as a regex separator and therefore falls
# back to the slow python engine (with a ParserWarning). A literal tab keeps
# the fast C engine and parses identically.
df_train = pd.read_csv(r'D:\WebDownload\IMDB影评数据集\labeledTrainData.tsv', sep='\t')
df_test = pd.read_csv(r'D:\WebDownload\IMDB影评数据集\testData.tsv', sep='\t')

处理数据,testData数据需要先将评分从id中提取出来,然后将大于等于7的评分标记为积极(1),最后将评论中的<br />标签替换掉

# Derive a sentiment label for each test review from the rating suffix of its
# id (e.g. "12311_10" -> rating 10); a rating >= 7 counts as positive (1).
df_test['sentiment'] = df_test.id.map(
    lambda v: 1 if int(v.replace("\"", "").split('_')[1]) >= 7 else 0
)
# Strip HTML line-break tags from the review text in both splits.
df_test.review = df_test.review.str.replace("<br />", "", regex=False)
df_train.review = df_train.review.str.replace("<br />", "", regex=False)

制作单词表

根据评论中的单词制作单词表,然后将评论转换为单词编码

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build a vocabulary from the 1000 most frequent training words, then encode
# every review as a sequence of word indices.
reviews_train = df_train.review.tolist()
reviews_test = df_test.review.tolist()

vocab_tokenizer = Tokenizer(num_words=1000)
vocab_tokenizer.fit_on_texts(reviews_train)

df_train_review = vocab_tokenizer.texts_to_sequences(reviews_train)
df_test_review = vocab_tokenizer.texts_to_sequences(reviews_test)

训练模型

数据

将前面的数据转换为tf.data.Dataset

对于长度参差不齐的句子,人为设置一个阈值,对大于此长度的句子,选择截断部分单词,可以选择截去句首单词,也可以截去句末单词;对于小于此长度的句子,可以选择在句首或句尾填充。

# Pad/truncate every encoded review to exactly 80 tokens, pair features with
# labels in tf.data pipelines, and batch with drop_remainder so every batch is
# exactly 128 samples (the fixed-size RNN initial states below require this).
x_train = pad_sequences(df_train_review, maxlen=80)
x_test = pad_sequences(df_test_review, maxlen=80)

db_train = (data.Dataset.from_tensor_slices((x_train, df_train.sentiment))
            .shuffle(1000)
            .batch(128, drop_remainder=True))
db_test = (data.Dataset.from_tensor_slices((x_test, df_test.sentiment))
           .batch(128, drop_remainder=True))

网络模型

class MyRNN(keras.Model):
    """Two-layer stacked LSTM sentiment classifier built from LSTMCell units.

    Takes a batch of word-id sequences (batch fixed at 128, seq_len 80) and
    returns a per-sample probability of positive sentiment.
    """

    def __init__(self, units):
        '''
        params: units - length of the RNN state vector
        '''
        super(MyRNN, self).__init__()
        # Initial [h, c] states for each LSTM layer. The batch dimension is
        # hard-coded to 128 to match the dataset batching (drop_remainder=True).
        self.stat0 = [tf.zeros([128, units]), tf.zeros([128, units])]
        self.stat1 = [tf.zeros([128, units]), tf.zeros([128, units])]
        self.embeding = keras.layers.Embedding(1000, 100, input_length=80)

        self.rnn_cell0 = keras.layers.LSTMCell(units, dropout=0.5)
        self.rnn_cell1 = keras.layers.LSTMCell(units, dropout=0.5)

        self.out_layer = keras.Sequential([
            keras.layers.Dense(units),
            keras.layers.Dropout(rate=0.5),
            keras.layers.ReLU(),
            keras.layers.Dense(1)
        ])

    def call(self, inputs, training=None):
        x = self.embeding(inputs)
        state0 = self.stat0
        state1 = self.stat1
        # Unroll over the time axis, feeding layer-0 output into layer 1.
        for word in tf.unstack(x, axis=1):
            out0, state0 = self.rnn_cell0(word, state0, training=training)
            # BUG FIX: the original called rnn_cell0 again on the raw
            # embedding, leaving rnn_cell1 unused and the two layers unstacked.
            out1, state1 = self.rnn_cell1(out0, state1, training=training)
        x = self.out_layer(out1, training=training)
        prob = tf.sigmoid(x)
        return prob
    
# Train the custom two-layer LSTM with 64-dimensional state vectors.
model = MyRNN(64)
model.compile(
    optimizer=keras.optimizers.RMSprop(0.001),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
model.fit(db_train, epochs=20)

经过20轮的训练,精度达到了0.916,测试精度达到0.841

推荐使用keras的高级接口:

# High-level Keras API version of the stacked-LSTM classifier.
# FIX: the original listed `keras.layer.Flatten()` — `keras.layer` does not
# exist (AttributeError), and a Flatten between Embedding and LSTM would also
# collapse the (batch, 80, 100) sequence that LSTM requires — so that layer
# is removed.
model = keras.Sequential([
    keras.layers.Embedding(1000, 100, input_length=80),  # (batch, 80) -> (batch, 80, 100)
    keras.layers.LSTM(64, return_sequences=True),        # layer 0: keep full sequence for stacking
    keras.layers.LSTM(64),                               # layer 1: final hidden state only
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer=keras.optimizers.RMSprop(0.001),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])
model.fit(db_train, epochs=20)

但上面代码执行的时要注意,如果是A卡环境,不能使用GPU加速,会报错(原因大概为:tensorflow检测到有GPU,LSTM/GRU底层默认会调用CudnnRNNv2,N卡环境才有CudnnRNNv2)。

使用预训练的词向量

使用预训练的 Word Embedding 模型来得到单词的表示方法,基于预训练模型的词向量相当于迁移了整个语义空间的知识,可以有效的缩短训练时间,还可以提高性能。

加载GloVe词向量

def load_glove_vectors(glove_file):
    """Parse a GloVe text file into a vocabulary set and a word->vector map.

    Each line is expected as: `<word> <v1> <v2> ... <vN>` (space-separated).
    Returns (words, word_to_vec_map) where vectors are float32 numpy arrays.
    """
    vocabulary = set()
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf-8') as fh:
        for raw in fh:
            parts = raw.strip().split()
            token = parts[0]
            vocabulary.add(token)
            word_to_vec_map[token] = np.array(parts[1:], dtype=np.float32)
    return vocabulary, word_to_vec_map
 
# Load the 100-dimensional GloVe 6B embeddings from disk (hard-coded path).
words, word_to_vec_map = load_glove_vectors(r'D:\dwload\glove\glove.6B.100d.txt')
print("词汇量大小:", len(words))
print("向量维度:", word_to_vec_map['word'].shape)  # show the vector dimensionality via one sample word

构建Tokenizer

# Rebuild the tokenizer with a larger 10000-word vocabulary for the
# GloVe-based model, and keep its word -> index mapping for later use.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df_train.review.tolist())
imdn_word_index = tokenizer.word_index

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Re-encode the reviews with the 10000-word tokenizer, pad to 100 tokens, and
# build batched tf.data pipelines (fixed batches of 128 samples).
df_train_review = tokenizer.texts_to_sequences(df_train.review.tolist())
df_test_review = tokenizer.texts_to_sequences(df_test.review.tolist())

padded_train = pad_sequences(df_train_review, maxlen=100)
padded_test = pad_sequences(df_test_review, maxlen=100)

db_train = (tf.data.Dataset
            .from_tensor_slices((padded_train, df_train.sentiment.tolist()))
            .shuffle(1000)
            .batch(128, drop_remainder=True))
db_test = (tf.data.Dataset
           .from_tensor_slices((padded_test, df_test.sentiment.tolist()))
           .batch(128, drop_remainder=True))

嵌入矩阵

根据GloVe向量和tokenizer的词索引,构建embedding_matrix

# Build the embedding matrix: row i holds the GloVe vector for the word with
# tokenizer index i (index 0 is reserved for padding; words missing from
# GloVe stay all-zero).
num_words = min(10000, len(imdn_word_index)) + 1
embedding_matrix = np.zeros((num_words, 100))  # 100 = GloVe vector dimension
for word, i in imdn_word_index.items():
    # FIX: the original `if i > 10000: break` relied on word_index iterating
    # in frequency-rank order; skipping with the exact bound is safe
    # regardless of iteration order (and the kept index set is identical).
    if i >= num_words:
        continue
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

模型

这里对embedding进行修改,使用预训练的词向量;设置trainable=False,表示后续的训练过程不再对embedding进行训练。

class MyRNN(keras.Model):
    """Two-layer stacked GRU sentiment classifier over frozen GloVe embeddings.

    The embedding layer is initialized from `embedding_matrix` and kept
    non-trainable (trainable=False), so training only updates the GRU cells
    and the dense output head.
    """

    def __init__(self, units):
        '''
        params: units - length of the RNN state vector
        '''
        super(MyRNN, self).__init__()
        # Initial hidden states; the batch dimension is hard-coded to 128 to
        # match the dataset batching (drop_remainder=True).
        # BUG FIX: GRUCell keeps a single state tensor (unlike LSTMCell's
        # [h, c] pair), so each layer gets one zeros tensor, not two.
        self.stat0 = [tf.zeros([128, units])]
        self.stat1 = [tf.zeros([128, units])]
        self.embeding = keras.layers.Embedding(num_words, 100,
                                               weights=[embedding_matrix],
                                               input_length=100, trainable=False)

        self.rnn_cell0 = keras.layers.GRUCell(units)  # GRUCell instead of LSTMCell
        self.rnn_cell1 = keras.layers.GRUCell(units)

        self.out_layer = keras.Sequential([
            keras.layers.Dense(units),
            keras.layers.ReLU(),
            keras.layers.Dense(1)
        ])

    def call(self, inputs, training=None):
        x = self.embeding(inputs)
        state0 = self.stat0
        state1 = self.stat1
        # Unroll over the time axis, feeding layer-0 output into layer 1.
        for word in tf.unstack(x, axis=1):
            out0, state0 = self.rnn_cell0(word, state0, training=training)
            # BUG FIX: the original invoked rnn_cell0 again on the raw
            # embedding, leaving rnn_cell1 unused and the layers unstacked.
            out1, state1 = self.rnn_cell1(out0, state1, training=training)
        x = self.out_layer(out1, training=training)
        prob = tf.sigmoid(x)
        return prob
        
# Train with the frozen GloVe embedding, then score on the held-out test set.
model = MyRNN(64)
model.compile(
    optimizer=keras.optimizers.RMSprop(0.001),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
model.fit(db_train, epochs=30)
model.evaluate(db_test)

通过使用预训练词向量,训练速度明显提升;经过30轮的训练,训练精度达到了0.977,测试精度0.832(存在过拟合)。

Logo

开源鸿蒙跨平台开发社区汇聚开发者与厂商,共建“一次开发,多端部署”的开源生态,致力于降低跨端开发门槛,推动万物智联创新。

更多推荐