#! /usr/bin/env python
# coding=utf-8
from scrapy.selector import Selector
from scrapy.http import Request
import re
import os
import json
from bs4 import BeautifulSoup
from scrapy.spiders import Spider
import urllib2
import thread
# Work around encoding issues (Python 2 only: reload() re-exposes
# sys.setdefaultencoding, which site.py hides at startup)
import sys
reload(sys)
sys.setdefaultencoding('gb18030')
# flag makes sure the first pass (the index page) skips scraping an
# individual news page's content
flag = 1
# project root; keep the trailing separator so the path joins below work
projectpath = 'E:\\spider2\\spider\\spider\\'
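# loop() below pulls the comment feed for one news page. It assumes the
# 163.com comment endpoint returns JSON shaped roughly like the sample
# below (field names are taken from the code; the exact payload of the
# live API is an assumption):
#
#   {
#       "commentIds": ["1234567890", "1234567891,1234567892"],
#       "comments": {
#           "1234567890": {"commentId": 1234567890,
#                          "createTime": "2015-01-01 12:00:00",
#                          "content": "..."}
#       }
#   }
#
# Entries in "commentIds" may be comma-joined reply chains; the first id
# in a chain is the key into the "comments" dict.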
def loop(*response):
    sel = Selector(response[0])
    title = sel.xpath('//h1/text()').extract()
    pages = sel.xpath('//div[@id="endText"]//p/text()').extract()
    p = sel.xpath('//div[@class="ep-tie-top"]/a[last()]/text()').extract()
    comment_num = int(p[0]) if p else 0  # comment count shown on the page
    # the 16-character news id sits at a fixed position at the end of the URL
    index2 = len(response[0].url)
    news_id = response[0].url[index2 - 21:index2 - 5]
    # the productKey required by the comment API is embedded in the page source
    url_contain = urllib2.urlopen(response[0].url).read()
    pattern = re.compile(r'"productKey"\s*:\s*"(\w+)"')
    match = pattern.search(url_contain)
    comment_id = match.group(1)
    cmntlist = []
    idlist = []
    page = 1
    # file holding the news URL, title, body and comments
    file2 = None
    while (page == 1) or (cmntlist != []):
        # comment-feed URL; the offset/limit paging parameters are an
        # assumption about the API -- without an offset every iteration
        # fetches the same page and the loop never terminates
        url = ("https://comment.news.163.com/api/v1/products/" + str(comment_id)
               + "/threads/" + str(news_id) + "/comments/hotTopList"
               + "?offset=" + str((page - 1) * 30) + "&limit=30")
        url_contain = urllib2.urlopen(url).read()
        text = json.loads(url_contain)
        if text['comments']:
            cmntlist = text['comments']
            idlist = text['commentIds']
        else:
            cmntlist = []
            idlist = []
        # on the first page, open the output file and write URL/title/body
        if cmntlist != [] and page == 1:
            filename = str(news_id) + '.txt'
            path = projectpath + 'stock\\' + filename
            file2 = open(path, 'a+')
            news_content = ''
            for paragraph in pages:
                news_content = news_content + paragraph + '\n'
            item = ("<url>" + response[0].url + "</url>" + '\n\n'
                    + "<title>" + str(title[0]) + "</title>\n\n"
                    + "<content>\n" + str(news_content)
                    + "</content>\n\n<comment>\n")
            file2.write(item)
        if idlist != []:
            content = ''
            for status in idlist:
                # an entry may be a comma-joined reply chain; the first id
                # keys the comments dict
                status_ic = str(status).split(',')[0]
                s = cmntlist[status_ic]['content']
                content = (content + str(cmntlist[status_ic]['createTime']) + '\t'
                           + str(cmntlist[status_ic]['commentId']) + '\t' + s + '\n')
            file2.write(content)
        page = page + 1
    # close the file once the comment pages run out
    if file2 is not None:
        file2.write("</comment>")
        file2.close()
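# loop() is kicked off below with thread.start_new_thread, which swallows
# exceptions silently. A sketch of a more debuggable alternative using the
# higher-level threading module (behaviour otherwise unchanged):
#
#   import threading
#   t = threading.Thread(target=loop, args=(response,))
#   t.daemon = True
#   t.start()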
class SpiderSpider(Spider):
    name = "stockwy"
    allowed_domains = ["163.com"]
    # start_urls barely matters here: the real link list is pulled from
    # news_json.js in parse() below
    start_urls = [
        "https://news.163.com"
    ]
    global projectpath
    if not os.path.exists(projectpath + 'stock'):
        os.mkdir(projectpath + 'stock')
    def parse(self, response):
        sel = Selector(response)
        global flag
        if flag == 1:
            flag = 2
            # news_json.js holds the current link list for the channel
            url = "https://news.163.com/special/0001220O/news_json.js"
            # pose as a browser
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            req = urllib2.Request(url, headers=headers)
            # do not shadow the Scrapy `response`, it is still needed below
            response2 = urllib2.urlopen(req)
            url_contain = response2.read()
            # strip the trailing ';' and everything before '={' so that what
            # remains is a literal dict
            index3 = len(url_contain)
            url_contain = url_contain[0:index3 - 1]
            b = '={'
            after = url_contain[url_contain.index(b) + len(b) - 1:]
            text = eval(after)
            # news is a list of categories, each a list of dicts whose 'l'
            # key is the article link
            news = text['news']
            if not os.path.exists(projectpath + 'stock\\' + 'link'):
                os.mkdir(projectpath + 'stock\\' + 'link')
            filename = 'link.txt'
            path = projectpath + 'stock\\link\\' + filename
            filelink = open(path, 'a+')
            if news != []:
                for category in range(0, 3):
                    for status_dic in news[category]:
                        mil_link = status_dic['l']
                        filelink.write(str(mil_link) + '\n')
                        # recurse into parse() with each article URL
                        yield Request(mil_link, callback=self.parse, dont_filter=True)
            filelink.close()
        # spawn a new thread to scrape each individual news page
        if flag != 1:
            if (response.status != 404) and (response.status != 502):
                thread.start_new_thread(loop, (response,))
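

if __name__ == '__main__':
    # Run the spider directly (a sketch; the usual route is `scrapy crawl
    # stockwy` from the Scrapy project root).
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(SpiderSpider)
    process.start()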