from __future__ import print_function
import json
import re                          # regular expressions
import time                        # timing / sleep between requests
import jieba                       # Chinese word segmentation
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from PIL import Image
from wordcloud import WordCloud    # word-cloud rendering
import paddlehub as hub
import requests
# Request the iQiyi comment API and return the response body
def getMovieinfo(url):
    '''
    Request the iQiyi comment API and return the response body.
    Param url: comment API url
    :return: response text
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    session = requests.Session()
    response = session.get(url, headers=headers)
    assert response.status_code == 200, 'unexpected HTTP status code'
    return response.text
# Parse the JSON payload and collect the comments
def saveMovieInfoToFile(lastId, arr):
    '''
    Parse the JSON payload and collect the comments.
    Param lastId: id of the last comment fetched  arr: list collecting comment texts
    :return: new lastId for the next page
    '''
    url = 'https://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id=15068699100&hot_size=0&last_id=' + str(lastId)
    resJson = json.loads(getMovieinfo(url))
    comments = resJson['data']['comments']
    arr += [comment['content'] for comment in comments if 'content' in comment]
    # Return the id of the last comment so the next request continues from there
    return comments[-1]['id']
def stopwordslist(file_path):
    '''
    Build the stopword list.
    Param file_path: path to the stopword file
    :return: list of stopwords
    '''
    with open(file_path, 'r', encoding='UTF-8') as f:
        words = f.read().splitlines()
    return words
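
# clear_special_char is called from __main__ but was missing from the
# original script; the sketch below is one plausible implementation,
# not necessarily the author's original. It keeps CJK characters,
# letters, and digits, and splits the cleaned text on sentence-ending
# punctuation so the caller can iterate over individual sentences.
def clear_special_char(content):
    '''
    Strip emoji and other special characters from a raw comment.
    Param content: raw comment string
    :return: list of cleaned sentences
    '''
    # Split on common Chinese/English sentence delimiters first
    sentences = re.split(r'[。！？!?\n]', content)
    # Then keep only CJK characters, ASCII letters, and digits
    pattern = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9]')
    return [pattern.sub('', s) for s in sentences if s.strip() != '']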
def movestopwords(sentence, stopwords, counts):
    '''
    Drop stopwords and update the word-frequency counter.
    Param sentence: segmented sentence (list of words)  stopwords: stopword list  counts: frequency dict to update
    :return: None
    '''
    for word in sentence:
        # Skip stopwords and single-character tokens
        if word not in stopwords and len(word) != 1:
            counts[word] = counts.get(word, 0) + 1
    return None
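
# fenci is called from __main__ but was missing from the original
# script; a minimal sketch using jieba's default (accurate) mode.
def fenci(text):
    '''
    Segment a Chinese sentence into words with jieba.
    Param text: sentence to segment
    :return: list of words
    '''
    return jieba.lcut(text)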
def text_detection(file_path):
    '''
    Run PaddleHub porn detection over the saved comments.
    :return: list of flagged comments with their probabilities
    '''
    porn_res = []
    with open(file_path, 'r', encoding='UTF-8') as f:
        porn = hub.Module(name='porn_detection_lstm')
        # No need to detect duplicate sentences: deduplicate with a set,
        # and skip empty or single-character lines
        comments = {x.strip() for x in f if len(x.strip()) > 1}
        # Set use_gpu=False when no GPU build of Paddle is available
        res = porn.detection(data={'text': list(comments)}, use_gpu=True, batch_size=50)
        for index, item in enumerate(res):
            if item['porn_detection_key'] == 'porn':
                porn_res.append(item['text'] + ':' + str(item['porn_probs']))
    return porn_res
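
# drawcounts and drawcloud are called from __main__ but were missing
# from the original script; the sketches below are minimal plausible
# implementations. 'SimHei.ttf' and 'cloud.png' are assumptions: point
# them at any local CJK-capable font and mask image, or drop the mask
# argument to render a plain rectangular cloud.
def drawcounts(counts, num):
    '''
    Draw a bar chart of the top-num most frequent words.
    Param counts: word-frequency dict  num: number of words to plot
    :return: None
    '''
    items = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:num]
    words = [kv[0] for kv in items]
    freqs = [kv[1] for kv in items]
    font = font_manager.FontProperties(fname='SimHei.ttf')  # assumed CJK font
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(words)), freqs)
    plt.xticks(range(len(words)), words, fontproperties=font)
    plt.title('Top-{} word frequencies'.format(num), fontproperties=font)
    plt.savefig('bar_result.png')
    plt.show()

def drawcloud(counts):
    '''
    Render a word cloud from the word-frequency dict.
    Param counts: word-frequency dict
    :return: None
    '''
    mask = np.array(Image.open('cloud.png'))   # assumed mask image
    wc = WordCloud(font_path='SimHei.ttf',     # assumed CJK font
                   background_color='white',
                   max_words=200,
                   mask=mask)
    wc.generate_from_frequencies(counts)
    wc.to_file('wordcloud_result.png')
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()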
# Comments are paginated, so the iQiyi API has to be called repeatedly to
# fetch every page; some comments contain emoji and other special characters.
# num is the page count, 10 comments per page: to crawl 1000 comments, set num=100
if __name__ == "__main__":
    stopwords_file = 'baidu_stopwords.txt'
    clear_comments_file = 'comments.txt'
    num = 100
    arr = []
    counts = {}
    lastId = 0
    for i in range(num):
        lastId = saveMovieInfoToFile(lastId, arr)
        time.sleep(0.5)            # be polite to the API between page requests
    # Write the cleaned comments to a txt file; the with-statement
    # takes care of closing the file
    with open(clear_comments_file, 'a', encoding='UTF-8') as f:
        for comment in arr:
            clear_comment = clear_special_char(comment)
            for sentence in clear_comment:
                if sentence.strip() != '':
                    try:
                        f.write(sentence + '\n')
                    except Exception as e:
                        print('sentence contains unwritable special characters')
    print(len(arr))
    # Load the stopword list once instead of re-reading the file per comment
    stopwords = stopwordslist(stopwords_file)
    for comment in arr:
        movestopwords(fenci(comment), stopwords, counts)
    drawcounts(counts, 10)
    drawcloud(counts)
    porn_all = text_detection(clear_comments_file)
    # Print the flagged ("harmonized") comments
    for porn_mes in porn_all:
        print(porn_mes)