#! /usr/bin/env python

#coding=utf-8
from scrapy.selector import Selector
from scrapy.http import Request
import re, os, json
from bs4 import BeautifulSoup
from scrapy.spiders import Spider
import urllib2,thread
# work around Python 2 encoding issues when writing Chinese text
import sys
reload(sys) 
sys.setdefaultencoding('gb18030') 
# flag guarantees that the first crawl (the start page) does not scrape individual news pages
flag=1
projectpath='E:\\spider2\\spider\\spider' 
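# loop() scrapes one news page plus its hot comments; it is launched via
# thread.start_new_thread, which passes the Scrapy response wrapped in a
# 1-tuple, hence the *response signature and the response[0] accesses below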
def loop(*response): 
    sel = Selector(response[0]) 
    title = sel.xpath('//h1/text()').extract() 
    pages=sel.xpath('//div[@id="endText"]//p/text()').extract() 
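    # comment count, read from the last link of the 'ep-tie-top' bar
    # (parsed here but not used further below)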
    p=sel.xpath('//div[@class="ep-tie-top"]/a[last()]/text()').extract() 
    comment_num=int(p[0]) 
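    # the url is assumed to end in '.html'; the 16 characters right before
    # that suffix are taken as the news id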
    index2=len(response[0].url) 
    news_id=response[0].url[index2-21:index2-5] 
    url_contain=urllib2.urlopen(response[0].url).read() 
    # the comment board's productKey is embedded in the raw page source
    pattern=re.compile(r'\"productKey\" : \"+(\w(?!_))+\"') 
    match = pattern.search(url_contain) 
    index5=len(match.group()) 
    # strip the leading '"productKey" : "' (16 chars) and the trailing quote
    comment_id=match.group()[16:index5-1]
    cmntlist=[] 
    idlist=[] 
    page=1
    # output file that will hold the news url, title, body and comments
    file2=None
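    # each article is written out as:
    #   <url>...</url>, <title>...</title>, <content>article body</content>,
    #   <comment> one hot comment per line: createTime \t commentId \t text </comment>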
    while((page==1) or (cmntlist != [])): 
      # hot-comment API endpoint for this article
      url="https://comment.news.163.com/api/v1/products/"+str(comment_id)+"/threads/"+str(news_id)+"/comments/hotTopList" 
      url_contain=urllib2.urlopen(url).read() 
      # parse the JSON response into the dict text
      text = json.loads(url_contain) 
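      # (assumption, inferred from how the fields are used below) the response
      # is expected to look roughly like:
      #   {"commentIds": ["123456789", ...],
      #    "comments": {"123456789": {"commentId": ..., "createTime": ..., "content": ...}, ...}}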
      if text['comments']: 
        cmntlist=text['comments'] 
        idlist=text['commentIds'] 
      else: 
        cmntlist=[] 
        idlist=[] 
      if cmntlist != [] and (page==1): 
        filename=str(news_id)+'.txt' 
        path=projectpath+'stock\\' +filename
        file2=open(path,'a+') 
        news_content=str('') 
        for p in pages: 
          news_content=news_content+p+'\n' 
        item="<url>"+response[0].url+"</url>"+'\n\n'+"<title>"+str(title[0])+"</title>\n\n"+"<content>\n"+str(news_content)+"</content>\n\n<comment>\n" 
        file2.write(item) 
      if idlist != []: 
        content='' 
        for status in idlist: 
            # the first 9 characters of each id entry are used as the key into cmntlist
            status_ic=str(status)[0:9] 
            s=cmntlist[status_ic]['content'] 
            content=content+str(cmntlist[status_ic]['createTime'])+'\t'+str(cmntlist[status_ic]['commentId'])+'\t'+s+'\n' 
        file2.write(content) 
      page=page+1
      # hotTopList is not paginated, so a second request would only return the
      # same data again; stop after the first pass
      break
    # close the output file once, after the loop, if it was actually opened
    if file2 is not None: 
      file2.write("</comment>") 
      file2.close() 

class SpiderSpider(Spider): 
  name = "stockwy" 
  allowed_domains = ["163.com"] 
  # start_urls hardly matters here: the start page itself is not really parsed
  start_urls = [ 
    "https://news.163.com" 
  ] 
  global projectpath
  # create the output directory once, when the spider class is defined
  if not os.path.exists(projectpath+'stock'): 
    os.mkdir(projectpath+'stock') 


  def parse(self, response): 
    sel = Selector(response) 
    global flag
    if(flag==1): 
        flag=2
        url="https://news.163.com/special/0001220O/news_json.js" 
        # disguise the request as coming from a regular browser
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' 
        headers = { 'User-Agent' : user_agent } 
        req = urllib2.Request(url, headers=headers) 
        # use a separate name so the Scrapy response passed into parse() is not shadowed
        js_response = urllib2.urlopen(req) 
        url_contain = js_response.read() 
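        # news_json.js is a JavaScript assignment ("... ={ ... };"); the code
        # below drops the last character (presumably the trailing ';') and keeps
        # everything from the '{' onward so only the object literal is left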
        index3=len(url_contain) 
        url_contain=url_contain[0:index3-1] 
        b='={' 
        after = url_contain[url_contain.index(b)+len(b)-1:] 
        # the object literal is evaluated as a Python expression; this relies
        # on it also being valid Python syntax
        text=eval(after) 
        news=text['news'] 
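        # (inferred from the loop below) text['news'] is a list of category
        # lists, and each entry is a dict whose 'l' field holds the article url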

        if not os.path.exists(projectpath+'stock\\'+'link'): 
            os.mkdir(projectpath+'stock\\'+'link') 
        filename='link.txt' 
        path=projectpath+'stock\\link\\' + filename
        filelink=open(path,'a+') 
        if news != []: 
            # walk the first three news categories and queue every article link
            for category in range(0,3): 
                for status_dic in news[category]: 
                    mil_link=status_dic['l'] 
                    filelink.write(str(mil_link)+'\n') 
                    # recursively call parse on each extracted article url
                    yield Request(mil_link, callback=self.parse, dont_filter=True) 
        filelink.close() 
    else: 
      # crawl each individual news page in its own thread
      if (response.status != 404) and (response.status != 502): 
        thread.start_new_thread(loop,(response,)) 
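
# How to run (a sketch, assuming this file lives inside a normal Scrapy project
# and the projectpath directory above exists on disk):
#
#   scrapy crawl stockwy
#
# Each article is then saved as <news_id>.txt under projectpath+'stock',
# holding its url, title, body and hot comments, and every extracted article
# link is appended to projectpath+'stock\link\link.txt'.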

