The following script, written for Python 3.5, crawls WeChat articles listed on weixin.sogou.com. The User-Agent string is taken from the Firefox browser, and Fiddler is used as the local proxy server (it listens on 127.0.0.1:8888 by default). The code is as follows:
import re
import urllib.request
import urllib.error
import time

def use_proxy(proxy_addr, url):
    # Fetch url through the given HTTP proxy and return the raw response bytes.
    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0')
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(req).read()
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)

key = urllib.request.quote("Python")   # URL-encode the search keyword once, before the loop
proxy = "127.0.0.1:8888"               # Fiddler's listening address
for i in range(0, 10):
    # Search-result page i for the keyword on weixin.sogou.com (type=2 restricts results to articles)
    thispageurl = ("http://weixin.sogou.com/weixin?query=" + key
                   + "&_sug_type_=&sut=1777&lkt=7%2C1519106265525%2C1519106267321"
                   + "&s_from=input&_sug_=y&type=2&sst0=1519106267427&page=" + str(i)
                   + "&ie=utf8&w=01019900&dr=1")
    thispagedata = use_proxy(proxy, thispageurl)
    print(len(str(thispagedata)))
    # Extract the article links from the result page
    pat = '<a target="_blank" href="(.*?)"'
    rs = re.compile(pat, re.S).findall(str(thispagedata))
    if len(rs) == 0:
        print("Page " + str(i) + " failed")
        continue
    for j in range(0, len(rs)):
        # The links come back HTML-escaped, so turn "&amp;" back into "&"
        thisurl = rs[j].replace("amp;", "")
        file = "d:/111" + str(i) + str(j) + ".html"
        thisdata = use_proxy(proxy, thisurl)
        try:
            with open(file, "wb") as fh:
                fh.write(thisdata)
            print("Article " + str(i) + "-" + str(j) + " saved successfully")
        except Exception as e:
            print(e)
            print("Article " + str(i) + "-" + str(j) + " failed")
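Before launching the full 10-page crawl, it can help to confirm that requests are actually being routed through Fiddler. Below is a minimal sketch for that check, assuming Fiddler is already running and listening on 127.0.0.1:8888, and that the use_proxy function above has been saved in a module; the module name sogou_spider and the output path d:/test_page.html are hypothetical choices for illustration.

# Quick proxy check: fetch one search page through Fiddler and save it for inspection.
# Assumes use_proxy() from the script above; the module name "sogou_spider" is hypothetical.
import urllib.request
from sogou_spider import use_proxy

proxy = "127.0.0.1:8888"                       # Fiddler's listening address
key = urllib.request.quote("Python")           # URL-encode the keyword
testurl = "http://weixin.sogou.com/weixin?query=" + key + "&type=2&page=1&ie=utf8"

data = use_proxy(proxy, testurl)
if data:
    print("Got", len(data), "bytes through the proxy")
    with open("d:/test_page.html", "wb") as fh:   # path chosen for illustration
        fh.write(data)
else:
    print("No data returned - check that Fiddler is running and capturing traffic")

If the request shows up in Fiddler's session list and the saved page contains the expected search results, the crawler's regex and file-saving loop can then be run against all ten pages.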