import pandas as pd
import numpy as np

# Read training data set.
# 'utf_8_sig' strips a leading byte-order mark from the file if one is present.
mblog = pd.read_csv('clean_mblog.csv', encoding='utf_8_sig')
mblog.head()

import jieba.analyse
# Introducing logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer


def get_keywords(raw):
    """Extract keywords from one microblog row as a '|'-separated string.

    Args:
        raw: A row of ``mblog`` (as passed by ``DataFrame.apply(axis=1)``)
            providing the ``isLongText`` flag and the ``chinese_text`` body.

    Returns:
        The extracted keywords joined with ``'|'``: up to 50 keywords when
        ``isLongText == 1``, otherwise up to 20. An empty string is returned
        when ``chinese_text`` is not a string (e.g. NaN for an empty CSV cell).
    """
    text = raw['chinese_text']
    if not isinstance(text, str):
        # jieba cannot tokenize NaN/None; treat a missing body as "no keywords"
        # instead of aborting the whole apply().
        return ''

    # Long text: extract 50 keywords. Non-long text: 20 keywords, which is the
    # default number extracted by jieba when topK is not given.
    top_k = 50 if raw['isLongText'] == 1 else 20
    keywords = jieba.analyse.extract_tags(text, topK=top_k)
    return '|'.join(keywords)


# generate keyword data
mblog['keywords'] = mblog.apply(get_keywords, axis=1)
# Each 'keywords' entry is already a '|'-joined keyword list, so tokenizing is
# just a split on that separator.
vectorizer = CountVectorizer(
    max_features=2000,
    analyzer='word',
    tokenizer=lambda s: s.split('|'),
)

# Generating bag vector data
blog_bow = vectorizer.fit_transform(mblog['keywords'])

# Selected training sample data.
# Use one boolean mask for both X and y so features and labels stay aligned
# even when the labeled rows are not the first rows of the frame (the previous
# `blog_bow[:len(y), :]` silently assumed they were).
labeled_mask = mblog['attitude'].notnull().values
y = mblog.loc[labeled_mask, 'attitude']
X = blog_bow[labeled_mask]

# Train the model and view the training accuracy.
# NOTE(review): recent scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before removing it.
lr_model = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
).fit(X, y)
# This is accuracy on the training rows themselves, not on held-out data.
train_accuracy = lr_model.score(X, y)
print(train_accuracy)  # 0.99

# Forecast other microblogs and view the results
print(mblog.chinese_text.iloc[8])
print(lr_model.predict(blog_bow[8, :]))
Huawei Mate30 Protective case exposure , It feels so ugly [0.]
In the training samples, this microblog was manually labeled -1 (negative),
but the model predicts 0 (neutral) for it.
# Check the distribution of various attitudes in training data import matplotlib.pyplot as plt plt.rcParams['font.sans-serif'
]=['SimHei'] # It is used to display Chinese label normally %matplotlib inline predict_attitude = pd.DataFrame(
lr_model.predict(blog_bow[:,:]),columns=['predict']) plt.figure(figsize=[15,5])
plt.subplot(1,2,1) labels = [' negative ',' neutral ',' positive '] plt.title(' The distribution of various attitudes in training data ') mblog.
groupby('attitude').size().plot.pie(labels=labels,autopct='%1.1f%%',explode=(
0.05,0.05,0.05),shadow=True) plt.subplot(1,2,2) labels = [' negative ',' neutral ',' positive '] plt.
title(' Attitude distribution of all predicted results ') predict_attitude.groupby('predict').size().plot.pie(
labels=labels,autopct='%1.1f%%',explode=(0.05,0.05,0.05),shadow=True)

Model analysis:
Looking at the predictions for many microblogs, the model classifies most of them as positive. Given the characteristics of the data, this is probably caused by an uneven (imbalanced) distribution of the attitude classes in the training data.
This hypothesis can be verified by looking at the distribution of the various attitudes.

Technology