The following script, written for Python 3.5, crawls WeChat articles listed on weixin.sogou.com. The User-Agent string is taken from the Firefox browser, and Fiddler is used as the local proxy server (it listens on 127.0.0.1:8888 by default). The code is as follows:
import re
import urllib.request
import urllib.error
import time

def use_proxy(proxy_addr, url):
    # Fetch url through the given HTTP proxy and return the raw response bytes.
    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0')
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(req).read()
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)

key = urllib.request.quote("Python")   # URL-encode the search keyword once, before the loop
proxy = "127.0.0.1:8888"               # Fiddler's listening address
for i in range(0, 10):
    # Search-result page i for the keyword on weixin.sogou.com (type=2 restricts results to articles)
    thispageurl = ("http://weixin.sogou.com/weixin?query=" + key
                   + "&_sug_type_=&sut=1777&lkt=7%2C1519106265525%2C1519106267321"
                   + "&s_from=input&_sug_=y&type=2&sst0=1519106267427&page=" + str(i)
                   + "&ie=utf8&w=01019900&dr=1")
    thispagedata = use_proxy(proxy, thispageurl)
    print(len(str(thispagedata)))
    # Extract the article links from the result page
    pat = '<a target="_blank" href="(.*?)"'
    rs = re.compile(pat, re.S).findall(str(thispagedata))
    if len(rs) == 0:
        print("Page " + str(i) + " failed")
        continue
    for j in range(0, len(rs)):
        # The links come back HTML-escaped, so turn "&amp;" back into "&"
        thisurl = rs[j].replace("amp;", "")
        file = "d:/111" + str(i) + str(j) + ".html"
        thisdata = use_proxy(proxy, thisurl)
        try:
            with open(file, "wb") as fh:
                fh.write(thisdata)
            print("Article " + str(i) + "-" + str(j) + " saved successfully")
        except Exception as e:
            print(e)
            print("Article " + str(i) + "-" + str(j) + " failed")
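Before launching the full 10-page crawl, it can help to confirm that requests are actually being routed through Fiddler. Below is a minimal sketch for that check, assuming Fiddler is already running and listening on 127.0.0.1:8888, and that the use_proxy function above has been saved in a module; the module name sogou_spider and the output path d:/test_page.html are hypothetical choices for illustration.

# Quick proxy check: fetch one search page through Fiddler and save it for inspection.
# Assumes use_proxy() from the script above; the module name "sogou_spider" is hypothetical.
import urllib.request
from sogou_spider import use_proxy

proxy = "127.0.0.1:8888"                       # Fiddler's listening address
key = urllib.request.quote("Python")           # URL-encode the keyword
testurl = "http://weixin.sogou.com/weixin?query=" + key + "&type=2&page=1&ie=utf8"

data = use_proxy(proxy, testurl)
if data:
    print("Got", len(data), "bytes through the proxy")
    with open("d:/test_page.html", "wb") as fh:   # path chosen for illustration
        fh.write(data)
else:
    print("No data returned - check that Fiddler is running and capturing traffic")

If the request shows up in Fiddler's session list and the saved page contains the expected search results, the crawler's regex and file-saving loop can then be run against all ten pages.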