# 爬虫库urllib的使用

# 时间：2021-07-19 16:46:08 收藏：0 阅读：0
# -*- coding: utf-8 -*-
# @Time : 2021/7/18 11:40
# @Author :liuw
# @File : testUrllib.py
# @Software: PyCharm

import urllib.request
import urllib.parse

# 获取get请求
# response = urllib.request.urlopen("http://www.baidu.com")
# print(response.read().decode('utf-8'))  # 建议对返回到的网页源码进行解码

# 获取post请求 发送一个表单数据（把用户名和密码进行加密传输）模拟用户登录，现实可能还会携带cookie信息
# 介绍一个网站 http://httpbin.org 当你向其发送访问时，它会告诉你得到什么响应结果 测试http和https
# A simple HTTP Request & Response Service.
# data = bytes(urllib.parse.urlencode({"hello":"world"}),encoding="utf-8") #post封装的data
# response = urllib.request.urlopen("http://httpbin.org/post",data=data)
# print(response.read().decode('utf-8'))  # 该数据和网站执行post请求得到的信息是一致的

#得到的数据 "User-Agent": "Python-urllib/3.8", 告诉网站我们是爬虫程序，真实的浏览器访问得到的User-Agent为
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",

# 超时处理机制 0.01s是模拟网站实际访问的超时处理机制时间
# try:
#     response = urllib.request.urlopen("http://httpbin.org/get",timeout=0.01)
#     print(response.read().decode('utf-8'))
# except urllib.error.URLError as e:
#     print('time out!')


# response = urllib.request.urlopen("http://douban.com")
# print(response.status)  # 报418 发现你是爬虫程序

#拿到server属性
# response = urllib.request.urlopen("http://baidu.com")
# print(response.getheader("Server"))

'''
{
  "args": {},
  "data": "",
  "files": {},
  "form": {},
  "headers": {
    "Accept": "application/json",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Content-Length": "0",
    "Host": "httpbin.org",
    "Origin": "http://httpbin.org",
    "Referer": "http://httpbin.org/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "X-Amzn-Trace-Id": "Root=1-60f3a4ea-5c26c4624ab4486627dc377e"
  },
  "json": null,
  "origin": "113.116.176.73",
  "url": "http://httpbin.org/post"
}
'''
# Disguise the request as a real browser when visiting Douban. A Request object
# can also carry form data, custom headers and an explicit HTTP method.
# How to find your browser's User-Agent: open Baidu, click the red record
# button, reload the page, then click the red button again to stop. See:
# https://jingyan.baidu.com/article/95c9d20d7bca17ec4e7561a4.html
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# POST variant of the same idea (kept disabled):
# url = "http://httpbin.org/post"
# data = bytes(urllib.parse.urlencode({"name":"eric"}),encoding="utf-8")  # form data for the POST body
# # Build a request object holding the headers, the data and the URL.
# req = urllib.request.Request(url=url,data=data,headers=headers,method="POST")
# response = urllib.request.urlopen(req)
# print(response.read().decode('utf-8'))

# Visit the Douban site while presenting a browser User-Agent.
url = "http://douban.com"
# Build a request object holding the URL and the headers; with no data and no
# explicit method, urllib sends it as a GET.
req = urllib.request.Request(url=url, headers=headers)
# Use the response as a context manager so the underlying connection is closed
# even if reading or decoding fails.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))