Applying Python for Natural Language Processing of Public Opinion Text, Part 4: Model Interface (JSON API)


Project Description

Building on the public opinion model deployed earlier in this series, this part develops and exercises a JSON API for the machine learning model (a minimal Flask scaffold is sketched after the task list below).

  1. Send data in JSON format;
  2. Implement a JSON endpoint that handles a single record;
  3. Implement a JSON endpoint that handles multiple records;
  4. Test the endpoints with Postman.
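All of the snippets below assume an existing Flask application object named app, together with the imports they use (Flask, pandas, jieba, joblib, and so on). As a minimal sketch of that scaffold (the host, port, and file layout here are assumptions, not part of the original project):

import os
import re

import jieba
import joblib
import pandas as pd
from flask import Flask, jsonify, request

app = Flask(__name__)

# ... the route definitions from the sections below go here ...

if __name__ == '__main__':
    # Development server only; host and port are illustrative defaults
    app.run(host='0.0.0.0', port=5000, debug=True)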

Sending Data in JSON Format

Implementation

# Test JSON: special-equipment category names and their numeric codes
equtypes = [
    {
        '锅炉': 1, '压力容器': 2, '压力管道': 3, '电梯': 4, '起重机械': 5,
        '客运索道': 6, '大型游乐设施': 7, '场(厂)内专用机动车辆': 8, '其他': 9
    }
]

@app.route('/json', methods=['GET'])
def get_json():
    # Return the special-equipment type names as JSON
    return jsonify({'equtypes': equtypes})

Testing
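Besides Postman, the endpoint can be exercised from Python with the requests library. A minimal sketch, assuming the app is running locally on Flask's default port 5000:

import requests

# Fetch the equipment-type mapping as JSON
resp = requests.get('http://127.0.0.1:5000/json')
print(resp.status_code)  # expect 200
print(resp.json())       # {'equtypes': [{'锅炉': 1, ...}]}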

JSON Endpoint: Handling a Single Record

Implementation

@app.route('/getEquTypeAPI', methods=['POST'])
def get_EquTypeAPI():
    if request.method == 'POST':
        result = request.json
        result_df = pd.DataFrame(result)
        # Take the 'words' field of the row labelled 1 (the row index the client sends)
        pubwords = result_df.loc[1, 'words']

        # Path to the stop-word lists
        stop_words_path = './stop_words'
        # Load the stop-word lists
        stopwords1 = [line.rstrip() for line in
                      open(os.path.join(stop_words_path, '中文停用词库.txt'), 'r', encoding='utf-8')]
        stopwords2 = [line.rstrip() for line in
                      open(os.path.join(stop_words_path, '哈工大停用词表.txt'), 'r', encoding='utf-8')]
        stopwords3 = [line.rstrip() for line in
                      open(os.path.join(stop_words_path, '四川大学机器智能实验室停用词库.txt'), 'r', encoding='utf-8')]
        stopwords4 = [line.rstrip() for line in
                      open(os.path.join(stop_words_path, '自建停用词库.txt'), 'r', encoding='utf-8')]

        stopwords = stopwords1 + stopwords2 + stopwords3 + stopwords4

        def proc_text(raw_line):
            """
                Clean one line of text.
                Return the segmented result as a single string.
            """
            # 1. Use a regular expression to strip all non-Chinese characters
            filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
            chinese_only = filter_pattern.sub('', raw_line)

            # 2. Segment with jieba in accurate mode (part-of-speech tagging
            #    via pseg.cut(chinese_only) could be used here instead, to
            #    keep only verbs, adjectives and adverbs)
            seg_list = jieba.cut(chinese_only, cut_all=False)

            # 3. Remove stop words, keeping only meaningful tokens
            meaningful_words = []
            for word in seg_list:
                if word not in stopwords:
                    meaningful_words.append(word)
            # Join into a string so the caller gets the content,
            # not the generator object seg_list itself
            return ' '.join(meaningful_words)

        pub_test = proc_text(pubwords)
        pub_test_words = [pub_test]  # wrap in a list for the vectorizer
        print(pub_test_words[0])

        # vec is the vectorizer fitted during training (earlier in this series)
        vec_test_word = vec.transform(pub_test_words)
        print(vec_test_word)
        print(vec_test_word.shape)

        # Load the persisted classifier and predict the equipment type
        new_classifier = joblib.load('joblib_classifier.pkl')
        prediction = new_classifier.predict(vec_test_word[0:1])

        return jsonify({'prediction': str(prediction)})
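The handler above depends on two artifacts from the training stage: the fitted vectorizer vec, which must be in scope, and the classifier persisted as joblib_classifier.pkl. A hedged sketch of how both could be saved and reloaded with joblib (only joblib_classifier.pkl appears in the original code; the vectorizer file name is a hypothetical example):

import joblib

# At the end of training: persist the fitted classifier and vectorizer
joblib.dump(classifier, 'joblib_classifier.pkl')
joblib.dump(vec, 'joblib_vectorizer.pkl')  # hypothetical file name

# In the API process: load them back once, before serving requests
new_classifier = joblib.load('joblib_classifier.pkl')
vec = joblib.load('joblib_vectorizer.pkl')  # hypothetical file name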

Testing
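A client-side check with requests, mirroring what a Postman POST would send. Note that the handler reads result_df.loc[1, 'words'], i.e. the row labelled 1, so the payload below carries two rows and the second one is classified (the sample sentences are illustrative):

import requests

payload = [
    {'words': '小区电梯突然停运'},
    {'words': '某地一台锅炉发生爆炸事故'},  # this row (index 1) is classified
]
resp = requests.post('http://127.0.0.1:5000/getEquTypeAPI', json=payload)
print(resp.json())  # e.g. {'prediction': '[1]'}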

JSON Endpoint: Handling Multiple Records

Implementation

This version of /getEquTypeAPI replaces the single-record route above; Flask will not register two view functions under the same name, so only one of the two should be present in the app at a time.

@app.route('/getEquTypeAPI', methods=['POST'])
def get_EquTypeAPI():
    if request.method == 'POST':
        result = request.json
        result_df = pd.DataFrame(result)

        # Path to the stop-word lists
        stop_words_path = './stop_words'
        # Load the stop-word lists
        stopwords1 = [line.rstrip() for line in
                      open(os.path.join(stop_words_path, '中文停用词库.txt'), 'r', encoding='utf-8')]
        stopwords2 = [line.rstrip() for line in
                      open(os.path.join(stop_words_path, '哈工大停用词表.txt'), 'r', encoding='utf-8')]
        stopwords3 = [line.rstrip() for line in
                      open(os.path.join(stop_words_path, '四川大学机器智能实验室停用词库.txt'), 'r', encoding='utf-8')]
        stopwords4 = [line.rstrip() for line in
                      open(os.path.join(stop_words_path, '自建停用词库.txt'), 'r', encoding='utf-8')]

        stopwords = stopwords1 + stopwords2 + stopwords3 + stopwords4

        def proc_text(raw_line):
            """
                Clean one line of text.
                Return the segmented result as a single string.
            """
            # 1. Use a regular expression to strip all non-Chinese characters
            filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
            chinese_only = filter_pattern.sub('', raw_line)

            # 2. Segment with jieba in accurate mode
            seg_list = jieba.cut(chinese_only, cut_all=False)

            # 3. Remove stop words, keeping only meaningful tokens
            meaningful_words = []
            for word in seg_list:
                if word not in stopwords:
                    meaningful_words.append(word)
            return ' '.join(meaningful_words)

        # Clean and segment every record in the request
        result_df['words_'] = result_df['words'].apply(proc_text)

        # Collect the segmented strings into a list for the vectorizer.
        # proc_text already returns a space-joined string, so each one is
        # appended as-is; joining a string again would split it into
        # single characters.
        pubwords = []
        for line_index in range(len(result_df['words_'])):
            try:
                pubwords.append(result_df.loc[line_index, 'words_'])
            except Exception:
                print(line_index)  # log rows that fail to process

        # Load the persisted classifier and predict all records in one batch
        new_classifier = joblib.load('joblib_classifier.pkl')
        prediction = list(new_classifier.predict(vec.transform(pubwords)))

        return jsonify({'prediction': str(prediction)})

Testing
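The same style of check for the batch endpoint, with every row's 'words' field classified (again, the local development server and the sample sentences are assumptions):

import requests

payload = [
    {'words': '小区电梯突然停运,多名业主被困'},
    {'words': '游乐园过山车发生故障'},
    {'words': '化工厂一台压力容器发生泄漏'},
]
resp = requests.post('http://127.0.0.1:5000/getEquTypeAPI', json=payload)
print(resp.json())  # one predicted label per record, e.g. {'prediction': '[4, 7, 2]'}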

Complete Deployment and Application

  1. https://github.com/ccmajor/ml-flask-EquType-Prediction
  2. https://gitee.com/stdfirm/ml-flask-EquType-Prediction

Acknowledgements

During the API development I studied and drew on Turning Machine Learning Models into APIs in Python.
Thanks to the experts who have gone before, this stretch of the road went much faster for a technical novice like me; I hope this write-up helps others in turn. Keep going!

