Saturday. February 24, 2018 - 10 mins

爬虫基础之 Requests 库

Requests 库介绍
Requests 库常用方法
高级用法
Demo
参数示例

Requests 库介绍

Requests 库是常用的http请求库，使用 requests 库可以模拟浏览器的请求，比起之前用到的urllib，requests模块的api更加便捷,更加人性化（本质就是封装了urllib3）

Requests 常用方法介绍

安装 requests

 pip3 install requests

GET请求

 import requests
 response = requests.get("http://httpbin.org/get")	 # 发送请求
 print(response.text)	#获取返回的html信息

POST 请求

 import Requests
 import json
 url = 'https://api.github.com/some/endpoint'
 headers = {'content-type': 'application/json'}
 form_data = {'username': '123456'}
 #携带请求头和数据实例
 ret = requests.post(url, data=json.dumps(form_data),headers=headers)

其它方法

 - requests.get(url, params=None, **kwargs)
 - requests.post(url, data=None, json=None, **kwargs)
 - requests.put(url, data=None, **kwargs)
 - requests.head(url, **kwargs)
 - requests.delete(url, **kwargs)
 - requests.patch(url, data=None, **kwargs)
 - requests.options(url, **kwargs)
 #以上方法均是在此方法的基础上构建
 - requests.request(method, url, **kwargs)

响应内容获取

 response.json()	#内置的 JSON 解码器
 response.headers	#字典形式展示响应头部
 response.status_code #响应状态码
 response.content 	#以字节的方式访问请求响应体
 response.url 		#获取请求的url
 response.cookies   #获取cookie

错误与异常

 - 网络问题（如：DNS 查询失败、拒绝连接等）：Requests 会抛出一个 ConnectionError 异常
 - HTTP 请求返回不成功的状态码：Response.raise_for_status() 会抛出一个 HTTPError 异常
 - 请求超时：抛出 Timeout 异常
 - 请求超过了设定的最大重定向次数：抛出 TooManyRedirects 异常
 所有Requests显式抛出的异常都继承自 requests.exceptions.RequestException

高级用法

基于以上的基础姿势，我们就可以做一些其它的骚操作啦 ~~

上传文件

 files = {'picture': open('catty.png', 'rb')}
 response = requests.post('http://httpbin.org/post', files=files)

获取cookie

 response = requests.get('https://www.baidu.com')
 for k, v in response.cookies.items():
     print(k,'',v)	# 这里根据需求保存

保持会话

会话对象让你能够跨请求保持某些参数。它也会在同一个 Session 实例发出的所有请求之间保持 cookie，期间使用 urllib3 的 connection pooling 功能。所以如果你向同一主机发送多个请求，底层的 TCP 连接将会被重用，从而带来显著的性能提升。

 session = requests.session()
 response = session.get('http://httpbin.org/cookies/set/number/123456')

保存图片

 response = requests.get('https://www.baidu.com/img/bd_logo1.png', headers=headers)
 with open('baidu.png', 'wb') as f:
 	f.write(response.content)

参考：http://docs.python-requests.org/zh_CN/latest/user/advanced.html

Demo

抽屉网点赞

 import requests
 - 方式一
 #首先登陆任何页面，获取cookie
 i1 = requests.get(url="http://dig.chouti.com/help/service")
 i1_cookies = i1.cookies.get_dict()

 #用户登陆，携带上一次的cookie，后台对cookie中的 gpsd 进行授权
 i2 = requests.post(
 	url="http://dig.chouti.com/login",
 	data={
 		'phone': "8613217981270",
 		'password': "123456",
 		'oneMonth': ""
 		},

 	cookies=i1_cookies	)

 #点赞（只需要携带已经被授权的gpsd即可）
 gpsd = i1_cookies['gpsd']
 i3 = requests.post(
 	url="http://dig.chouti.com/link/vote?linksId=14211711",
 	cookies={'gpsd': gpsd})


 - 方式二
 session = requests.Session()
 i1 = session.get(url="http://dig.chouti.com/help/service")
 i2 = session.post(
 	url="http://dig.chouti.com/login",
 	data={
 		'phone': "8613217981270",
 		'password': "123456",
 		'oneMonth': ""})
 i3 = session.post(url="http://dig.chouti.com/link/vote?linksId=14211711")
 print(i3.text)

参数示例

 def param_method_url():
 	# requests.request(method='get', url='http://127.0.0.1:8000/test/')
 	# requests.request(method='post', url='http://127.0.0.1:8000/test/')
 	pass
 def param_param():
 	# - 可以是字典
 	# - 可以是字符串
 	# - 可以是字节（ascii编码以内）

 	# requests.request(method='get',
 	# url='http://127.0.0.1:8000/test/',
 	# params={'k1': 'v1', 'k2': '水电费'})

 	# requests.request(method='get',
 	# url='http://127.0.0.1:8000/test/',
 	# params="k1=v1&k2=水电费&k3=v3&k3=vv3")

 	# requests.request(method='get',
 	# url='http://127.0.0.1:8000/test/',
 	# params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))

 	# 错误
 	# requests.request(method='get',
 	# url='http://127.0.0.1:8000/test/',
 	# params=bytes("k1=v1&k2=水电费&k3=v3&k3=vv3", encoding='utf8'))
 	pass


 def param_data():
 	# 可以是字典
 	# 可以是字符串
 	# 可以是字节
 	# 可以是文件对象

 	# requests.request(method='POST',
 	# url='http://127.0.0.1:8000/test/',
 	# data={'k1': 'v1', 'k2': '水电费'})

 	# requests.request(method='POST',
 	# url='http://127.0.0.1:8000/test/',
 	# data="k1=v1; k2=v2; k3=v3; k3=v4"
 	# )

 	# requests.request(method='POST',
 	# url='http://127.0.0.1:8000/test/',
 	# data="k1=v1;k2=v2;k3=v3;k3=v4",
 	# headers={'Content-Type': 'application/x-www-form-urlencoded'}
 	# )

 	# requests.request(method='POST',
 	# url='http://127.0.0.1:8000/test/',
 	# data=open('data_file.py', mode='r', encoding='utf-8'), # 文件内容是：k1=v1;k2=v2;k3=v3;k3=v4
 	# headers={'Content-Type': 'application/x-www-form-urlencoded'}
 	# )
 	pass


 def param_json():
 	# 将json中对应的数据进行序列化成一个字符串，json.dumps(...)
 	# 然后发送到服务器端的body中，并且Content-Type是 {'Content-Type': 'application/json'}
 	requests.request(method='POST',
 					url='http://127.0.0.1:8000/test/',
 					json={'k1': 'v1', 'k2': '水电费'})


 def param_headers():
 	# 发送请求头到服务器端
 	requests.request(method='POST',
 					url='http://127.0.0.1:8000/test/',
 					json={'k1': 'v1', 'k2': '水电费'},
 					headers={'Content-Type': 'application/x-www-form-urlencoded'}
 					)


 def param_cookies():
 	# 发送Cookie到服务器端
 	requests.request(method='POST',
 					url='http://127.0.0.1:8000/test/',
 					data={'k1': 'v1', 'k2': 'v2'},
 					cookies={'cook1': 'value1'},
 					)
 	# 也可以使用CookieJar（字典形式就是在此基础上封装）
 	from http.cookiejar import CookieJar
 	from http.cookiejar import Cookie

 	obj = CookieJar()
 	obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
 						discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
 						port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
 				)
 	requests.request(method='POST',
 					url='http://127.0.0.1:8000/test/',
 					data={'k1': 'v1', 'k2': 'v2'},
 					cookies=obj)


 def param_files():
 	# 发送文件
 	# file_dict = {
 	# 'f1': open('readme', 'rb')
 	# }
 	# requests.request(method='POST',
 	# url='http://127.0.0.1:8000/test/',
 	# files=file_dict)

 	# 发送文件，定制文件名
 	# file_dict = {
 	# 'f1': ('test.txt', open('readme', 'rb'))
 	# }
 	# requests.request(method='POST',
 	# url='http://127.0.0.1:8000/test/',
 	# files=file_dict)

 	# 发送文件，定制文件名
 	# file_dict = {
 	# 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
 	# }
 	# requests.request(method='POST',
 	# url='http://127.0.0.1:8000/test/',
 	# files=file_dict)

 	# 发送文件，定制文件名
 	# file_dict = {
 	#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
 	# }
 	# requests.request(method='POST',
 	#                  url='http://127.0.0.1:8000/test/',
 	#                  files=file_dict)

 	pass


 def param_auth():
 	from requests.auth import HTTPBasicAuth, HTTPDigestAuth

 	ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
 	print(ret.text)

 	# ret = requests.get('http://192.168.1.1',
 	# auth=HTTPBasicAuth('admin', 'admin'))
 	# ret.encoding = 'gbk'
 	# print(ret.text)

 	# ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
 	# print(ret)
 	#


 def param_timeout():
 	# ret = requests.get('http://google.com/', timeout=1)
 	# print(ret)

 	# ret = requests.get('http://google.com/', timeout=(5, 1))
 	# print(ret)
 	pass


 def param_allow_redirects():
 	ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
 	print(ret.text)


 def param_proxies():
 	# proxies = {
 	# "http": "61.172.249.96:80",
 	# "https": "http://61.185.219.126:3128",
 	# }

 	# proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

 	# ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
 	# print(ret.headers)


 	# from requests.auth import HTTPProxyAuth
 	#
 	# proxyDict = {
 	# 'http': '77.75.105.165',
 	# 'https': '77.75.105.165'
 	# }
 	# auth = HTTPProxyAuth('username', 'mypassword')
 	#
 	# r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
 	# print(r.text)

 	pass

 def param_stream():
 	ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
 	print(ret.content)
 	ret.close()

 	# from contextlib import closing
 	# with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
 	# # 在此处理响应。
 	# for i in r.iter_content():
 	# print(i)

 def requests_session():
 	import requests

 	session = requests.Session()

 	### 1、首先登陆任何页面，获取cookie

 	i1 = session.get(url="http://dig.chouti.com/help/service")

 	### 2、用户登陆，携带上一次的cookie，后台对cookie中的 gpsd 进行授权
 	i2 = session.post(
 		url="http://dig.chouti.com/login",
 		data={
 			'phone': "8615131255089",
 			'password': "xxxxxx",
 			'oneMonth': ""
 		}
 	)

 	i3 = session.post(
 		url="http://dig.chouti.com/link/vote?linksId=8589623",
 	)
 	print(i3.text)

Topaz

Always keep learning.