🌸 Welcome to the Python Office Automation column: use Python to handle office chores and free up your hands
💻 Personal homepage ——> Personal homepage, visitors welcome
😸 GitHub homepage ——> GitHub homepage, visitors welcome
❓ Zhihu homepage ——> Zhihu homepage, visitors welcome
🏳️🌈 CSDN blog homepage: please click ——> 一晌小贪欢's blog homepage, a follow is much appreciated
👍 Column for this series: please click ——> Python Office Automation column, subscriptions welcome
🕷 There is also a web scraping column: please click ——> Python Web Scraping Basics column, subscriptions welcome
📕 And a Python basics column: please click ——> Python Basics column, subscriptions welcome
The author's skill and knowledge are limited; if you find any mistakes in this article, corrections are very welcome 🙏
❤️ Thanks to everyone for following! ❤️
Course Objectives
- Gain an in-depth command of the requests library
- Learn to handle the various HTTP request parameters
- Understand session management and cookie handling
- Master handling the different response formats
1. Overview of the requests Library
requests is the most popular HTTP library in Python; it makes HTTP requests simple and elegant.
1.1 Advantages of requests
- Clean API design
- Automatic encoding handling
- Built-in JSON support
- Session object support
- SSL verification
- Connection pooling
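requests is a third-party package rather than part of the standard library, so it needs to be installed once before any of the examples below will run:

# Install requests from PyPI (run this in a shell, not inside Python)
pip install requests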
2. Basic HTTP Requests
2.1 GET Requests
import requests

# Basic GET request
response = requests.get('https://httpbin.org/get')
print(response.text)

# GET request with query parameters
params = {
    'key1': 'value1',
    'key2': 'value2'
}
response = requests.get('https://httpbin.org/get', params=params)
print(response.url)  # Inspect the full URL
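requests URL-encodes the params dictionary into the query string, so the printed URL should look like https://httpbin.org/get?key1=value1&key2=value2. httpbin also echoes the parsed parameters back in its JSON body, which is handy for checking:

# httpbin returns the query parameters it received under the 'args' key
print(response.json()['args'])  # {'key1': 'value1', 'key2': 'value2'}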
2.2 POST Requests
# Send form data
data = {
    'username': 'admin',
    'password': '123456'
}
response = requests.post('https://httpbin.org/post', data=data)

# Send JSON data
json_data = {
    'name': '张三',
    'age': 25
}
response = requests.post('https://httpbin.org/post', json=json_data)
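The two calls encode the body differently: data= sends an application/x-www-form-urlencoded form body, while json= serializes the dictionary and automatically sets the Content-Type header to application/json. httpbin echoes both back, which makes the difference easy to see:

# With data=... the values come back under 'form'; with json=... under 'json'
print(response.json()['json'])                    # {'name': '张三', 'age': 25}
print(response.request.headers['Content-Type'])   # application/json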
2.3 Other HTTP Methods
# PUT request
response = requests.put('https://httpbin.org/put', data={'key': 'value'})

# DELETE request
response = requests.delete('https://httpbin.org/delete')

# HEAD request (fetches only the response headers)
response = requests.head('https://httpbin.org/get')
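Since a HEAD response carries no body, the useful information is in the headers; a quick check might look like this:

print(response.status_code)                   # 200
print(response.headers.get('Content-Type'))   # the type the matching GET would return
print(len(response.content))                  # 0, because HEAD returns no body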
3. Request Parameters in Detail
3.1 Setting Request Headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Referer': 'https://www.google.com'
}
response = requests.get('https://httpbin.org/headers', headers=headers)
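httpbin's /headers endpoint echoes back the headers it received, which is a convenient way to confirm that the custom values were actually sent:

# The echoed headers include the User-Agent we set above
print(response.json()['headers']['User-Agent'])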
3.2 Timeout Settings
try:
    # Set the connection timeout and the read timeout separately
    response = requests.get('https://httpbin.org/delay/5', timeout=(3, 10))
except requests.exceptions.Timeout:
    print("Request timed out")
except requests.exceptions.RequestException as e:
    print(f"Request error: {e}")
3.3 Proxy Settings
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
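If the proxy requires authentication, the credentials can be embedded directly in the proxy URL (the host below is just a placeholder):

# Username and password go straight into the proxy URL
proxies = {
    'http': 'http://user:password@proxy.example.com:8080',
    'https': 'http://user:password@proxy.example.com:8080'
}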
4. Working with the Response Object
4.1 Response Attributes
response = requests.get('https://httpbin.org/get')
print(f"Status code: {response.status_code}")
print(f"Response headers: {response.headers}")
print(f"Encoding: {response.encoding}")
print(f"URL: {response.url}")
print(f"Elapsed time: {response.elapsed}")
4.2 Getting the Response Content
# Text content
text_content = response.text

# Binary content
binary_content = response.content

# JSON content
try:
    json_content = response.json()
except ValueError:
    print("The response is not valid JSON")

# Raw response
raw_content = response.raw
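Two details worth knowing: response.text is decoded using response.encoding, which can be overridden (for example with response.apparent_encoding) when a page declares the wrong charset, and response.raw is only readable if the request was made with stream=True, otherwise the data has already been consumed into response.content:

# Fix garbled text by adopting the encoding detected from the body
response.encoding = response.apparent_encoding
print(response.text[:200])

# The raw urllib3 response is only readable when stream=True was used
response = requests.get('https://httpbin.org/get', stream=True)
print(response.raw.read(100))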
4.3 Checking the Status Code
response = requests.get('https://httpbin.org/status/404')

if response.status_code == 200:
    print("Request succeeded")
elif response.status_code == 404:
    print("Page not found")
else:
    print(f"Request failed, status code: {response.status_code}")

# Use raise_for_status() to check automatically
try:
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    print(f"HTTP error: {e}")
5. Session Management
5.1 The Session Object
# Create a session object
session = requests.Session()

# Set session-level request headers
session.headers.update({
    'User-Agent': 'My Spider 1.0'
})

# Send requests through the session
response1 = session.get('https://httpbin.org/cookies/set/sessioncookie/123456789')
response2 = session.get('https://httpbin.org/cookies')
print(response2.json())  # The cookie set by the first request is still there
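Besides persisting cookies and headers, a Session reuses the underlying TCP connection for requests to the same host, which is noticeably faster than opening a new connection every time. Using it as a context manager makes sure those connections are released:

# The with-block closes the session (and its pooled connections) automatically
with requests.Session() as session:
    session.headers.update({'User-Agent': 'My Spider 1.0'})
    for _ in range(3):
        response = session.get('https://httpbin.org/get')
        print(response.status_code)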
5.2 Cookie Handling
import requests
from requests.cookies import RequestsCookieJar

# Set cookies manually
cookies = RequestsCookieJar()
cookies.set('cookie_name', 'cookie_value', domain='httpbin.org')
response = requests.get('https://httpbin.org/cookies', cookies=cookies)

# Read a cookie from a response (disable redirects: httpbin sets the cookie
# on a 302 response and then redirects, and after the redirect the final
# response no longer carries the Set-Cookie header)
response = requests.get('https://httpbin.org/cookies/set/test/value', allow_redirects=False)
print(response.cookies['test'])
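If a plain dictionary is more convenient than a cookie jar, requests ships helpers to convert between the two:

# Convert a cookie jar to an ordinary dict (and back again)
cookie_dict = requests.utils.dict_from_cookiejar(response.cookies)
print(cookie_dict)  # {'test': 'value'}
jar = requests.utils.cookiejar_from_dict(cookie_dict)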
6. File Upload and Download
6.1 Uploading Files
# Upload a file
files = {'file': open('example.txt', 'rb')}
response = requests.post('https://httpbin.org/post', files=files)

# Specify the filename and content type explicitly
files = {
    'file': ('report.csv', open('report.csv', 'rb'), 'text/csv')
}
response = requests.post('https://httpbin.org/post', files=files)
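One caveat: the open() calls above never close their file handles. Wrapping the upload in a with statement closes the file as soon as the request finishes:

# Close the uploaded file handle as soon as the request is done
with open('example.txt', 'rb') as f:
    response = requests.post('https://httpbin.org/post', files={'file': f})
print(response.status_code)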
6.2 Downloading Files
def download_file(url, filename):
    """Download a file and save it to disk."""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"File downloaded successfully: {filename}")
    except Exception as e:
        print(f"Download failed: {e}")

# Usage example
download_file('https://httpbin.org/image/png', 'test_image.png')
7. Exception Handling
7.1 Common Exception Types
import requests
from requests.exceptions import (
    RequestException,
    ConnectionError,
    HTTPError,
    Timeout,
    TooManyRedirects
)

def safe_request(url):
    """Send a GET request with full exception handling."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response
    except ConnectionError:
        print("Connection error")
    except HTTPError as e:
        print(f"HTTP error: {e}")
    except Timeout:
        print("Request timed out")
    except TooManyRedirects:
        print("Too many redirects")
    except RequestException as e:
        print(f"Request error: {e}")
    return None
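A quick usage sketch: the helper returns the Response on success and None on any failure, so callers only need a single check:

response = safe_request('https://httpbin.org/status/500')  # prints "HTTP error: ..."
if response is not None:
    print(response.text)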
8. Hands-on Example: Fetching Weather Information
import requests
import json

class WeatherSpider:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_weather(self, city):
        """Fetch the weather for the given city."""
        url = "https://api.openweathermap.org/data/2.5/weather"
        params = {
            'q': city,
            'appid': 'your_api_key',  # Register with OpenWeatherMap to obtain an API key
            'units': 'metric',
            'lang': 'zh_cn'
        }
        try:
            response = self.session.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            return self.parse_weather_data(data)
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return None

    def parse_weather_data(self, data):
        """Parse the weather data returned by the API."""
        weather_info = {
            'city': data['name'],
            'temperature': data['main']['temp'],
            'description': data['weather'][0]['description'],
            'humidity': data['main']['humidity'],
            'pressure': data['main']['pressure']
        }
        return weather_info

# Usage example
if __name__ == "__main__":
    spider = WeatherSpider()
    weather = spider.get_weather("Beijing")
    if weather:
        print(json.dumps(weather, ensure_ascii=False, indent=2))
9. Performance Optimization Tips
9.1 Connection Pooling
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()

# Configure a retry strategy
retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)
9.2 Streaming Downloads
def download_large_file(url, filename):
    """Stream a large file to disk without loading it all into memory."""
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
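If the server sends a Content-Length header, the same loop can report progress while it writes (a small sketch; servers using chunked transfer encoding will not provide that header):

def download_with_progress(url, filename):
    """Streaming download that prints a simple percentage progress."""
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        total = int(response.headers.get('Content-Length', 0))
        done = 0
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                done += len(chunk)
                if total:
                    print(f"\rProgress: {done / total:.1%}", end='')
        print()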
10. Practice Exercises
Exercise 1: Simulated Login
Write a program that logs in to a website and keeps the session alive.
Exercise 2: Batch Download
Write a program that downloads a batch of image files.
Exercise 3: API Calls
Call a public API, then fetch and parse the JSON data it returns.
11. Lesson Summary
In this lesson we studied in depth:
- Basic usage of the requests library
- Setting the various HTTP request parameters
- Working with the response object
- Session management and cookie handling
- File upload and download
- Exception handling
- Performance optimization tips
12. Next Lesson Preview
In the next lesson we will learn:
- Using the HTML parsing library BeautifulSoup
- CSS selectors and XPath expressions
- Extracting structured data from web pages
- Handling complex HTML structures
13. Homework
- Use the requests library to scrape the front page of a news website
- Implement a general-purpose file downloader
- Write a weather lookup program
- Practice handling the various HTTP status codes and exception cases
Tip: a solid command of the requests library is the foundation for becoming a scraping expert; practice using it in as many scenarios as possible.
Hoping this helps beginners; from a humble programmer devoted to office automation
Hoping to earn your 【❤️ free follow ❤️】, thank you!
Please 🤞 follow 🤞 + ❤️ like ❤️ + 👍 bookmark 👍
There is also the Office Automation column, subscriptions welcome: Python Office Automation column
There is also the Web Scraping column, subscriptions welcome: Python Web Scraping Basics column
There is also the Python Basics column, subscriptions welcome: Python Basics column
