
Predicting the GMP Clause for Drug Manufacturer Defects with sklearn's Naive Bayes Classifier

This post uses the naive_bayes classifier from sklearn to predict which GMP clause a defect found during an inspection of a drug manufacturer belongs to.

scikit-learn is a Python toolkit for data mining and data analysis built on NumPy, SciPy, and matplotlib; its naive Bayes implementation does the classification here.

The feature is implemented by three source files. sample2npy.py imports the CSV source data, segments the text, and builds the vocabulary, the training vector matrix, and the class vector, saving all three to disk. save_fit.py loads those three arrays, trains the classifier, and saves the fitted model to disk. On Ubuntu, crontab entries export the CSV source data and run these two programs in the small hours every day, so the training set stays current (see the sketch below). pre_gmp_clause.py runs whenever a user edits a defect and writes a predicted clause for the user's reference.
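A minimal sketch of the nightly refresh; the paths and the export command are hypothetical, since the source only states that the export and the two scripts run early every morning:

# m h dom mon dow  command (hypothetical paths)
0 2 * * *   /opt/gmp/export_gmp_sample.sh                               # dump gmp_clause_sample.csv
30 2 * * *  cd /opt/gmp && python sample2npy.py && python save_fit.py  # rebuild vectors and retrain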

# sample2npy.py
# -*- coding: utf-8 -*-
import re
import csv

import numpy as np
import jieba

VOCABLIST_FILE = 'vocablist.npy'        # vocabulary
TRAINMAT_FILE = 'trainmat.npy'          # training vector matrix
CLASSVEC_FILE = 'classvec.npy'          # class vector
SAMPLE_FILE = 'gmp_clause_sample.csv'   # CSV source data

def seg_sentence(sentence):
    # Chinese punctuation and high-frequency stop words to drop after segmentation
    chinese_punctuation = (',','“','”',':',';','。','(',')','《','》',',','.',';',' ','、','等','的','在','与','之','上','将','中','一','二','三','四','五','六','七','八','九','十','为','了','——','—','-')
    # strip ASCII letters, digits and punctuation before segmentation
    s = re.sub(r"[A-Za-z0-9\[\]`~!@#$^&*()=|{}':;,.<>/?\\%]", "", sentence.strip())
    seg_list = jieba.cut(s, cut_all=False)
    # drop tokens that are purely numeric (allowing '.' and '-' inside the token)
    seg_list = [x for x in seg_list if not x.replace('.', '').replace('-', '').isdigit()]
    seg_list = [x for x in seg_list if x not in chinese_punctuation]
    return seg_list

def loadDataSet():
    defect_list = []
    classVec = []
    with open(SAMPLE_FILE, encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if len(row) < 3:  # need the clause code (row[1]) and defect text (row[2])
                continue
            defect_list.append(seg_sentence(row[2]))
            classVec.append(int(row[1]))
    return defect_list, classVec

def setOfWords2Vec(vocabList, inputSet):
    # set-of-words model: 1 if a vocabulary word appears in the document, else 0
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec

def createVocabList(dataSet):
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

if __name__ == "__main__":
    defect_list, class_vec = loadDataSet()
    myVocabList = createVocabList(defect_list)
    np.save(VOCABLIST_FILE, myVocabList)
    np.save(CLASSVEC_FILE, class_vec)
    trainMat = []
    for defect in defect_list:
        trainMat.append(setOfWords2Vec(myVocabList, defect))
    np.save(TRAINMAT_FILE, trainMat)
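From the code, each CSV row is expected to carry the class label (the GMP clause code) in column 1 and the defect description in column 2; column 0 is presumably a record id. A quick interactive check of the helpers, assuming that layout and a hypothetical defect sentence:

# a hypothetical CSV row: id, clause code, defect description
#   17,2163,"洁净区压差计未按规定校准"
from sample2npy import seg_sentence, setOfWords2Vec

words = seg_sentence('洁净区压差计未按规定校准')
print(words)  # the segmented, filtered tokens
print(setOfWords2Vec(['校准', '规定'], words))  # 1 where a vocabulary word occurs, else 0

Note that the set-of-words encoding records only the presence or absence of each word; any repeat counts are discarded before the multinomial model sees the vector.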
# save_fit.py
# -*- coding: utf-8 -*-
import numpy as np
import joblib  # in older scikit-learn versions: from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB

TRAINMAT_FILE = 'trainmat.npy'
CLASSVEC_FILE = 'classvec.npy'
CLF_FIT_FILE = 'clf_fit.pkl'

if __name__ == "__main__":
    train_mat = np.load(TRAINMAT_FILE)
    class_list = np.load(CLASSVEC_FILE)

    # fit a multinomial naive Bayes classifier and persist it for pre_gmp_clause.py
    clf = MultinomialNB()
    clf.fit(train_mat, class_list)
    joblib.dump(clf, CLF_FIT_FILE)
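save_fit.py persists the model as-is; a held-out accuracy estimate can tell whether the predictions are worth showing to users. A minimal sketch, assuming the .npy files from sample2npy.py already exist (the 20% test split and the file name evaluate_fit.py are arbitrary choices, not part of the original pipeline):

# evaluate_fit.py -- optional sanity check, not part of the nightly pipeline
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

train_mat = np.load('trainmat.npy')
class_list = np.load('classvec.npy')

# hold out 20% of the labeled defects for testing
X_train, X_test, y_train, y_test = train_test_split(
    train_mat, class_list, test_size=0.2, random_state=0)

clf = MultinomialNB()
clf.fit(X_train, y_train)
print('held-out accuracy: %.3f' % accuracy_score(y_test, clf.predict(X_test)))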
# pre_gmp_clause.py
# -*- coding: utf-8 -*-
import re
import argparse

import numpy as np
import jieba
import joblib  # in older scikit-learn versions: from sklearn.externals import joblib
import psycopg2

VOCABLIST_FILE = 'vocablist.npy'
CLF_FIT_FILE = 'clf_fit.pkl'

def seg_sentence(sentence):
    # identical segmentation to sample2npy.py, so the input matches the training vocabulary
    chinese_punctuation = (',','“','”',':',';','。','(',')','《','》',',','.',';',' ','、','等','的','在','与','之','上','将','中','一','二','三','四','五','六','七','八','九','十','为','了','——','—','-')
    s = re.sub(r"[A-Za-z0-9\[\]`~!@#$^&*()=|{}':;,.<>/?\\%]", "", sentence.strip())
    seg_list = jieba.cut(s, cut_all=False)
    seg_list = [x for x in seg_list if not x.replace('.', '').replace('-', '').isdigit()]
    seg_list = [x for x in seg_list if x not in chinese_punctuation]
    return seg_list

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec

def make_txt_out(result, output):
    # class labels encode the clause as appendix * 1000 + clause number
    appendix = int(result / 1000)
    clause = int(result % 1000)
    rows = []
    try:
        # placeholder connection parameters
        conn = psycopg2.connect(database="dbname", user="username", password="passwd", port="port")
        cursor = conn.cursor()
        # parameterized query instead of string formatting
        cursor.execute("select appendix_name,snumber,content from gmpclause where appendix=%s and snumber=%s",
                       (appendix, clause))
        rows = cursor.fetchall()
    except Exception as e:
        print(e)
    if rows:
        appendix_name, snumber, content = rows[0][0], rows[0][1], rows[0][2]
        result_str = ''
        if appendix_name != 'GMP':
            result_str += '附录:' + appendix_name + ', '
        result_str += '第{0}条:{1}'.format(snumber, content)
        with open(output, 'w') as fh:
            fh.write(result_str)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # -o output.txt -d defect
    parser.add_argument('-o', '--output', help='output file for the predicted clause')
    parser.add_argument('-d', '--defect', help='defect description')
    args = parser.parse_args()
    output = args.output
    sentence = args.defect
    vocab_list = np.load(VOCABLIST_FILE)
    vocab_list = vocab_list.tolist()
    # vectorize the defect text and reshape to the (1, n_features) shape predict() expects
    defect_vec = np.array(setOfWords2Vec(vocab_list, seg_sentence(sentence)))
    defect_vec = defect_vec.reshape(1, defect_vec.shape[0])
    clf = joblib.load(CLF_FIT_FILE)
    clf_result = clf.predict(defect_vec)
    result = clf_result[0]
    make_txt_out(result, output)
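A sample invocation (the defect text is hypothetical):

python pre_gmp_clause.py -d '洁净区压差计未按规定校准' -o output.txt

The predicted label is decoded into an appendix number and a clause number, the clause text is looked up in the gmpclause table, and the result is written to output.txt for the editing interface to display.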

Screenshot of a sample run:
[screenshot of the program's output]