300字范文 > 基于softmax的文本多分类模型代码实现

基于softmax的文本多分类模型代码实现

时间：2022-07-17 23:02:30

对于多分类问题，可以使用softmax来做，但是效果不是那么好，当做一个算法的练手吧

首先是数据集处理的代码：

文件名：data_loader.py

# coding: utf-8
"""Data-loading helpers for a character-level text classifier.

Every dataset file holds one sample per line: the category name, a tab,
then the text.  The helpers here read such files, build and load a
character vocabulary, map categories to integer ids, and produce padded
id sequences plus shuffled mini-batches.
"""
import sys
from collections import Counter
import pdb

import numpy as np

if sys.version_info[0] > 2:
    is_py3 = True
else:
    # Python 2 only: make utf-8 the implicit str <-> unicode codec.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    is_py3 = False


def native_word(word, encoding='utf-8'):
    """Encode ``word`` on Python 2; return it unchanged on Python 3.

    Useful when a model trained under Python 3 is used from Python 2.
    """
    if not is_py3:
        return word.encode(encoding)
    return word


def native_content(content):
    """Decode ``content`` from utf-8 on Python 2; return it unchanged on Python 3."""
    if not is_py3:
        return content.decode('utf-8')
    return content


def open_file(filename, mode='r'):
    """Open ``filename`` the same way on Python 2 and Python 3.

    On Python 3 the file is opened as utf-8 text and undecodable bytes are
    ignored.  ``mode`` is ``'r'`` or ``'w'``.
    """
    if is_py3:
        return open(filename, mode, encoding='utf-8', errors='ignore')
    return open(filename, mode)


def read_file(filename):
    """Read a tab-separated dataset file.

    Returns ``(contents, labels)`` where ``contents[i]`` is the list of
    characters of sample ``i`` and ``labels[i]`` is its category name.
    Lines that do not consist of exactly two tab-separated fields are
    skipped.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Blank or malformed line: skip it explicitly instead of
                # hiding every possible error behind a bare ``except``.
                continue
            label, content = fields
            if not content:
                continue
            try:
                chars = list(native_content(content))
                label = native_content(label)
            except UnicodeDecodeError:
                # Python 2 only: the line is not valid utf-8.
                continue
            contents.append(chars)
            labels.append(label)
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from ``train_dir`` and save it.

    The ``vocab_size - 1`` most frequent characters are written to
    ``vocab_dir``, one per line, after a leading ``<PAD>`` entry (id 0)
    that is used to pad all texts to the same length.  Storing the
    vocabulary avoids recomputing it on every run.
    """
    data_train, _ = read_file(train_dir)
    counter = Counter()
    for content in data_train:
        counter.update(content)

    # e.g. [('c', 3), ('a', 1)]; an empty corpus simply yields no pairs.
    count_pairs = counter.most_common(max(vocab_size - 1, 0))
    words = ['<PAD>'] + [word for word, _ in count_pairs]

    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')


def read_vocab(vocab_dir):
    """Load the stored vocabulary.

    Returns ``(words, word_to_id)`` where a word's id is its line index.
    """
    with open_file(vocab_dir) as fp:
        # Remove only the line terminator: the vocabulary is character
        # level, so a token may itself be a whitespace character.
        words = [native_content(line.rstrip('\r\n')) for line in fp]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def read_category(file):
    """Collect the categories found in the first column of ``file``.

    Returns ``(categories, cat_to_id)``.  The categories are sorted so
    that the ids are identical on every run; iterating a ``set`` of
    strings directly can give a different order in each process.
    """
    names = set()
    with open_file(file) as f:
        for line in f:
            name = line.strip().split('\t')[0]
            if name:  # a blank line is not a category
                names.add(native_content(name))
    categories = sorted(names)
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id


def to_words(content, words):
    """Convert a sequence of ids back into text."""
    return ''.join(words[x] for x in content)


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a dataset file into padded id sequences and one-hot labels.

    Characters missing from ``word_to_id`` are dropped.  Returns
    ``(x_pad, y_pad)``: ``x_pad`` holds one row of ``max_length`` ids per
    sample and ``y_pad`` the matching one-hot category rows.

    Raises ``KeyError`` if a sample's category is not in ``cat_to_id``.
    """
    # Imported here rather than at module level so the pure-Python helpers
    # in this module remain usable without TensorFlow installed.
    import tensorflow.contrib.keras as kr

    contents, labels = read_file(filename)
    data_id = [[word_to_id[ch] for ch in content if ch in word_to_id]
               for content in contents]
    label_id = [cat_to_id[label] for label in labels]

    # Samples differ in length; with Keras' defaults shorter ones are
    # zero-padded at the front and longer ones are truncated at the front.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad


def parse_vector(label_id, size):
    """Return soft one-hot rows: 1 at the label index, 1e-8 elsewhere."""
    des = []
    for label in label_id:
        c = np.full(size, 1e-8)
        c[label] = 1
        des.append(c)
    return np.array(des)


def batch_iter(x, y, batch_size=64):
    """Yield shuffled ``(x_batch, y_batch)`` mini-batches covering the data once.

    ``x`` and ``y`` are indexed along their first axis and are shuffled
    with the same permutation, so rows stay aligned.  The last batch may
    be smaller than ``batch_size``; empty input yields no batches.

    Raises ``ValueError`` if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive')
    x = np.asarray(x)
    y = np.asarray(y)
    data_len = len(x)

    indices = np.random.permutation(data_len)
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for start_id in range(0, data_len, batch_size):
        end_id = start_id + batch_size  # slicing clamps at the end
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

文件名：run_softmax.py

# -*- coding: utf8 -*-
"""Train a single-layer softmax classifier on padded character-id sequences.

Reads the dataset and vocabulary through ``data_loader``, trains with
mini-batch gradient descent and prints per-batch and final training
accuracy.
"""
from __future__ import print_function

import pdb
import os

import tensorflow as tf
import numpy as np

from data_loader import read_vocab, read_category, batch_iter, process_file, build_vocab

train_file = 'data/baike_levelone_category_train.lst'
test_file = 'data/baike_levelone_category_test.lst'
vocab_file = 'data/baike_category_vocab.lst'
all_file = 'data/baike_levelone_category_all.lst'

max_vocab_size = 5000
seq_length = 600  # dimensionality of the input x
num_epochs = 10
batch_size = 64


def feed_data(x_batch, y_batch):
    """Build the feed dict binding a batch to the module-level placeholders."""
    return {x: x_batch, y_: y_batch}


if not os.path.exists(vocab_file):
    build_vocab(all_file, vocab_file, max_vocab_size)
print('build vocab over')

# All categories and their ids.
categorys, cat_to_id = read_category(all_file)
print('read category over')

words, word_to_id = read_vocab(vocab_file)
print('read vocab over')

x_train, y_train = process_file(train_file, word_to_id, cat_to_id, seq_length)
print('process file over')

num_classes = len(cat_to_id)

# Define the model.
with tf.device('/cpu:0'):
    x = tf.placeholder(tf.float32, [None, seq_length], name='input_x')
    y_ = tf.placeholder(tf.float32, [None, num_classes], name='input_y')

    w = tf.Variable(tf.truncated_normal(shape=[seq_length, num_classes], mean=0, stddev=1))
    b = tf.Variable(tf.truncated_normal(shape=[num_classes], mean=0, stddev=1))

    y_mat = tf.matmul(x, w) + b  # raw, un-normalised class scores (logits)
    y = tf.nn.softmax(y_mat)     # class probabilities, used for prediction only

    # Softmax cross-entropy is the loss for single-label multi-class
    # problems.  It must receive the raw logits: the op applies softmax
    # itself in a numerically stable way, which also avoids the nan that
    # a hand-written -sum(y_ * log(y)) produces once a probability
    # underflows to 0.  The mean keeps the step size independent of the
    # batch size (the last batch of an epoch is usually smaller).
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_mat))
    train_step = tf.train.GradientDescentOptimizer(0.005).minimize(cost)

    # Build the evaluation ops once, up front.  Creating ops inside the
    # training loop would add new nodes to the graph on every batch.
    predicted_class = tf.argmax(y, 1)  # index of the largest probability
    true_class = tf.argmax(y_, 1)
    correct_prediction = tf.equal(predicted_class, true_class)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

print('initial')
init = tf.global_variables_initializer()

print('session')
with tf.Session() as sess:
    sess.run(init)
    print('accuracy')

    for epoch in range(num_epochs):
        print('epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, batch_size)
        # x_batch has shape (batch_size, seq_length); batch_size is the
        # number of documents taken per step.
        for x_batch, y_batch in batch_train:
            # pdb.set_trace()
            feed_dict = feed_data(x_batch, y_batch)
            sess.run(train_step, feed_dict=feed_dict)
            # Evaluate after the update, fetching all three values in one run.
            batch_accuracy, batch_predicted, batch_true = sess.run(
                [accuracy, predicted_class, true_class], feed_dict=feed_dict)
            print('accuracy', batch_accuracy)
            print('y', batch_predicted)
            print('y_', batch_true)

    # NOTE(review): the original indentation was lost; this final
    # evaluation is assumed to run once, after all epochs - confirm.
    feed_dict = feed_data(x_train, y_train)
    print('accuracy', sess.run(accuracy, feed_dict=feed_dict))

说明：数据集文件的格式为每行一个样本，以制表符分隔，第一列为文本的分类，第二列为文本的内容

训练打印日志如下：

accuracy 0.578125y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]y_ [ 5 5 6 1 5 2 5 9 5 5 1 5 7 5 9 1 1 5 1 5 1 1 12 55 5 5 4 5 5 7 5 5 7 5 11 5 4 5 5 5 5 5 5 1 5 6 512 5 10 5 4 5 5 5 4 11 0 5 6 5 5 5]accuracy 0.5625y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]y_ [ 1 5 5 5 4 4 7 8 5 5 2 5 5 5 5 4 5 1 5 1 10 4 6 15 12 1 5 7 5 5 5 5 11 5 5 5 11 5 5 7 1 5 7 5 5 10 45 5 5 12 5 5 5 4 5 5 12 5 5 5 1 1]accuracy 0.59375y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]y_ [9 1 5 1 5 7 5 5 8 5 1 5 5 5 1 5 1 5 5 1 5 5 5 4 5 5 5 1 5 5 5 5 1 6 5 5 55 7 5 1 7 5 5 5 5 1 6 5 1 5 1 5 5 0 7 5 5 4 7 5 5 1 7]accuracy 0.625y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]y_ [11 4 5 5 5 5 5 5 5 5 11 5 5 1 5 5 11 5 5 8 5 5 11 15 2 9 5 5 1 7 5 5 5 1 1 1 1 5 5 5 5 5 5 5 5 5 55 5 12 1 1 1 5 8 5 5 5 5 1 1 1 5]accuracy 0.71875y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]y_ [ 5 6 5 5 1 5 5 8 5 5 5 5 5 5 5 5 5 5 5 1 7 12 5 55 5 5 1 5 5 5 7 5 5 4 6 4 5 5 5 5 5 5 5 5 4 5 65 1 5 9 5 1 11 5 5 1 5 5 5 5 5 5]accuracy 0.59375y [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]y_ [ 5 0 5 5 5 1 0 1 5 11 5 5 5 11 5 5 5 5 5 5 9 11 1 56 1 4 5 5 5 5 5 7 5 6 5 5 5 11 11 5 1 5 10 5 1 1 55 5 0 1 1 5 5 5 5 9 1 5 5 11 5 5]accuracy 1.0y [5]y_ [5]accuracy 0.62927836

可以看到在训练集上的准确率只有62%，这个效果很差，当然效果差的原因可能是其他方面我没有做好。

这个项目实战的学习心得：

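1、我们的机器上当别人在使用GPU的时候，我如果要执行我的训练，会抛出Out_OF_MEMORY内存不够的异常，本想通过with tf.device('/cpu:0') 来指定让代码不使用GPU，但是这种方式不行，应该还需要其他的方式执行，我最终将代码移到另外一台机器上执行的（原因是：tf.device只决定运算放在哪个设备上执行，而TensorFlow在创建Session时默认仍会去申请所有可见GPU的显存；要彻底不使用GPU，可以在导入tensorflow之前设置环境变量CUDA_VISIBLE_DEVICES=""，或者在创建Session时传入tf.ConfigProto(device_count={'GPU': 0})）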

2、梯度下降算法中学习率的选择问题

学习率不要选择太大，否则很难收敛，在学习的过程中可能总是跳过最佳点位

学习率也不要选择太小，否则，学习消耗的时间太长

3、损失函数的选择

我选择的损失函数是交叉熵，我一开始是自己写的交叉熵算法，如下：

cost = -tf.reduce_sum(y_*tf.log(y))

这导致的一个问题是，在优化的过程中，w和b的值出现[nan, nan, nan, nan]的问题。

原因应该是公式中用到了log函数：它的参数必须大于0。当softmax输出的某个概率下溢为0时，tf.log(0)并不会抛异常，而是返回-inf；-inf再与标签中的0相乘（0 × -inf）就得到nan，nan随着梯度传播，w和b也就全部变成了nan，这应该是引起nan的原因

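解决办法是，用tensorflow自带的计算交叉熵的函数，tensorflow提供了四种计算交叉熵的函数，可以选择其中一种。对于这里每个样本只属于一个类别的多分类问题，应选择tf.nn.softmax_cross_entropy_with_logits，并且传给它的logits必须是未经过softmax的原始输出（即tf.matmul(x,w) + b），因为该函数内部会以数值稳定的方式自行计算softmax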