Python爬虫第4课：XPath与lxml高级解析技术

🌸 欢迎来到Python办公自动化专栏—Python处理办公问题，解放您的双手

💻 个人主页——>个人主页欢迎访问

😸 Github主页——>Github主页欢迎访问

❓ 知乎主页——>知乎主页欢迎访问

🏳️‍🌈 CSDN博客主页：请点击——> 一晌小贪欢的博客主页求关注

👍 该系列文章专栏：请点击——>Python办公自动化专栏求订阅

🕷 此外还有爬虫专栏：请点击——>Python爬虫基础专栏求订阅

📕 此外还有python基础专栏：请点击——>Python基础学习专栏求订阅

文章作者技术和水平有限，如果文中出现错误，希望大家能指正🙏

❤️ 欢迎各位佬关注！ ❤️

课程目标

掌握XPath语法和表达式编写
学会使用lxml库进行高效解析
理解XPath与CSS选择器的区别
掌握处理复杂XML和HTML文档的技巧

1. XPath简介

XPath（XML Path Language）是一种在XML和HTML文档中查找信息的语言。它使用路径表达式来选取文档中的节点或节点集。

1.1 XPath的优势

功能强大，表达能力强
支持复杂的条件查询
可以进行数学运算和字符串操作
浏览器原生支持

1.2 安装lxml

pip install lxml

2. XPath基本语法

2.1 路径表达式

from lxml import html, etree

# Sample HTML document used by the examples below
html_content = """
<html>
<body>
    <div class="container">
        <h1 id="title">主标题</h1>
        <div class="content">
            <p>第一段</p>
            <p>第二段</p>
            <ul>
                <li>项目1</li>
                <li>项目2</li>
                <li>项目3</li>
            </ul>
        </div>
    </div>
</body>
</html>
"""

# Build the element tree from the markup string
tree = html.fromstring(html_content)

# Basic path expressions
# /  : select starting from the root node
# // : select matching nodes at any depth
# .  : the current node
# .. : the parent node
# @  : an attribute

# Absolute path: every step from the root element is spelled out
title = tree.xpath('/html/body/div/h1/text()')
print(title)  # ['主标题']

# Descendant search: // matches <p> elements wherever they appear
paragraphs = tree.xpath('//p/text()')
print(paragraphs)  # ['第一段', '第二段']

2.2 节点选择

# NOTE: a positional predicate such as [1] or [last()] is evaluated relative
# to each parent's children, not across the whole document. Wrapping the path
# in parentheses applies the predicate to the complete result set instead.

# Select every <div> element in the document.
divs = tree.xpath('//div')

# Select the first <div> in document order.
# ('//div[1]' would return every <div> that is the first <div> child of its
# parent, which is two elements in the sample document.)
first_div = tree.xpath('(//div)[1]')

# Select the last <li> in document order.
last_li = tree.xpath('(//li)[last()]')

# Select the first two <li> elements in document order.
first_two_li = tree.xpath('(//li)[position()<=2]')

# Select every <div> that carries a class attribute.
divs_with_class = tree.xpath('//div[@class]')

# Select the <div> whose class attribute is exactly "container".
container = tree.xpath('//div[@class="container"]')

2.3 属性选择

html_content = """
<div class="article">
    <a href="https://example.com" title="示例">链接1</a>
    <a href="https://test.com" title="测试">链接2</a>
    <img src="image1.jpg" alt="图片1" width="300">
    <img src="image2.png" alt="图片2" width="400">
</div>
"""

tree = html.fromstring(html_content)

# Collect the href attribute of every link
hrefs = tree.xpath('//a/@href')
print(hrefs)  # ['https://example.com', 'https://test.com']

# Collect the src attribute of every image
srcs = tree.xpath('//img/@src')
print(srcs)  # ['image1.jpg', 'image2.png']

# Images whose width is greater than 300; the attribute value is compared
# as a number, so width="300" itself does not match
wide_images = tree.xpath('//img[@width>300]/@src')
print(wide_images)  # ['image2.png']

3. XPath高级语法

3.1 条件表达式

html_content = """
<div class="products">
    <div class="product" data-price="100">
        <h3>产品A</h3>
        <span class="price">￥100</span>
    </div>
    <div class="product" data-price="200">
        <h3>产品B</h3>
        <span class="price">￥200</span>
    </div>
    <div class="product" data-price="50">
        <h3>产品C</h3>
        <span class="price">￥50</span>
    </div>
</div>
"""

tree = html.fromstring(html_content)

# Products priced above 100 (data-price is compared numerically, so the
# product priced exactly 100 is excluded)
expensive_products = tree.xpath('//div[@data-price>100]/h3/text()')
print(expensive_products)  # ['产品B']

# Elements whose text is exactly the given string
product_a = tree.xpath('//h3[text()="产品A"]')

# contains(): elements whose text includes the given substring
products_with_a = tree.xpath('//h3[contains(text(), "产品")]')

# starts-with(): elements whose class attribute begins with "price"
price_elements = tree.xpath('//span[starts-with(@class, "price")]')

3.2 轴（Axes）

html_content = """
<div class="container">
    <div class="header">头部</div>
    <div class="content">
        <p>段落1</p>
        <p class="highlight">段落2</p>
        <p>段落3</p>
    </div>
    <div class="footer">底部</div>
</div>
"""

tree = html.fromstring(html_content)

# Parent element of the highlighted paragraph.
parent = tree.xpath('//p[@class="highlight"]/parent::div')

# The sibling <p> immediately before the highlighted paragraph.
# Without [1] the axis returns *all* preceding <p> siblings; on a reverse
# axis such as preceding-sibling, position 1 is the nearest one.
preceding_sibling = tree.xpath('//p[@class="highlight"]/preceding-sibling::p[1]/text()')
print(preceding_sibling)  # ['段落1']

# The sibling <p> immediately after the highlighted paragraph.
# Without [1] the axis returns *all* following <p> siblings.
following_sibling = tree.xpath('//p[@class="highlight"]/following-sibling::p[1]/text()')
print(following_sibling)  # ['段落3']

# Every ancestor element of the highlighted paragraph.
ancestors = tree.xpath('//p[@class="highlight"]/ancestor::*')

# Every descendant element of the content <div>.
descendants = tree.xpath('//div[@class="content"]/descendant::*')

3.3 函数使用

# Text functions: keep <p> elements whose first text node is not blank
# once leading/trailing whitespace has been normalized away.
text_content = tree.xpath('//p[normalize-space(text())!=""]/text()')

# String length: <p> elements whose first text node is longer than 5 characters.
long_text = tree.xpath('//p[string-length(text())>5]/text()')

# Position functions (evaluated among the siblings of each parent).
first_p = tree.xpath('//p[position()=1]/text()')
last_p = tree.xpath('//p[position()=last()]/text()')

# Counting: XPath numbers are floating point, so count() comes back as a
# float (e.g. 3.0). Convert to int so the message shows a whole number.
p_count = int(tree.xpath('count(//p)'))
print(f"段落数量：{p_count}")

# String functions: XPath 1.0 has no upper-case(), so translate() is used to
# map lowercase ASCII letters to uppercase before the contains() test.
# Non-ASCII characters are left untouched by this mapping.
uppercase_text = tree.xpath('//p[contains(translate(text(), "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), "段落")]')

4. lxml库详解

4.1 解析HTML

from lxml import html
import requests

def parse_html_with_lxml(url, timeout=10):
    """Download *url* and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The element produced by ``lxml.html.fromstring`` for the response body.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    # The raw bytes are passed on so the parser can work out the encoding
    # itself; lxml also repairs some malformed HTML while parsing.
    return html.fromstring(response.content)

# Parse from a string
html_string = "<div><p>Hello World</p></div>"
tree = html.fromstring(html_string)

# Parse from a file
# NOTE: html.parse() returns an ElementTree wrapper rather than an element;
# call tree.getroot() to obtain the root element.
tree = html.parse('example.html')

4.2 解析XML

from lxml import etree

xml_content = """
<?xml version="1.0" encoding="UTF-8"?>
<bookstore>
    <book id="1" category="fiction">
        <title>Python编程</title>
        <author>张三</author>
        <price>59.99</price>
    </book>
    <book id="2" category="technical">
        <title>数据结构</title>
        <author>李四</author>
        <price>79.99</price>
    </book>
</bookstore>
"""

# Parse the XML.
# Two adjustments are required before lxml accepts this literal:
#   * strip(): the triple-quoted string starts with a newline, and an XML
#     declaration is only allowed at the very start of the document;
#   * encode(): lxml rejects str input that carries an encoding declaration,
#     so the document is handed over as bytes.
root = etree.fromstring(xml_content.strip().encode('utf-8'))

# Titles of all books
titles = root.xpath('//title/text()')
print(titles)  # ['Python编程', '数据结构']

# Titles of books in the "technical" category
tech_books = root.xpath('//book[@category="technical"]/title/text()')
print(tech_books)  # ['数据结构']

# Titles of books whose <price> child is greater than 60
expensive_books = root.xpath('//book[price>60]/title/text()')
print(expensive_books)  # ['数据结构']

4.3 命名空间处理

xml_with_ns = """
<?xml version="1.0"?>
<root xmlns:book="http://example.com/book"
      xmlns:author="http://example.com/author">
    <book:catalog>
        <book:item>
            <book:title>Python指南</book:title>
            <author:name>王五</author:name>
        </book:item>
    </book:catalog>
</root>
"""

# strip() removes the newline that precedes the XML declaration in the
# triple-quoted literal; the declaration must be the first thing in the
# document or the parser raises XMLSyntaxError.
root = etree.fromstring(xml_with_ns.strip())

# Map the prefixes used in the XPath expressions to their namespace URIs.
# Only the URIs have to match the document; the prefixes are local aliases.
namespaces = {
    'book': 'http://example.com/book',
    'author': 'http://example.com/author'
}

# Query with namespace-qualified names
titles = root.xpath('//book:title/text()', namespaces=namespaces)
authors = root.xpath('//author:name/text()', namespaces=namespaces)

print(titles)   # ['Python指南']
print(authors)  # ['王五']

5. 实战案例：爬取电商商品信息

import requests
from lxml import html
import csv
import time
import random

class ProductSpider:
    """Crawl paginated product listings and detail pages with requests and lxml XPath.

    Every request goes through one ``requests.Session`` so the browser-like
    User-Agent header configured in ``__init__`` is sent each time.
    """

    def __init__(self):
        """Create the shared HTTP session and set a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    @staticmethod
    def _build_page_url(url, page):
        """Return *url* with its ``page`` query parameter set to *page*.

        Existing query parameters are kept, so a URL that already contains
        ``?q=...`` becomes ``?q=...&page=N`` instead of the malformed
        ``?q=...?page=N``. A pre-existing ``page`` parameter is replaced.
        """
        from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

        parts = urlsplit(url)
        query = [
            (key, value)
            for key, value in parse_qsl(parts.query, keep_blank_values=True)
            if key != 'page'
        ]
        query.append(('page', str(page)))
        return urlunsplit(parts._replace(query=urlencode(query)))

    @staticmethod
    def _first(values, strip=False):
        """Return the first item of *values* (optionally stripped), or '' if empty."""
        if not values:
            return ''
        return values[0].strip() if strip else values[0]

    def get_product_list(self, url, max_pages=5):
        """Collect products from up to *max_pages* listing pages.

        Args:
            url: Listing URL; it may already contain a query string.
            max_pages: Maximum number of pages to request, starting at page 1.

        Returns:
            A list of product dicts as produced by ``extract_single_product``.
            Crawling stops early at the first page that yields no products.
        """
        all_products = []

        for page in range(1, max_pages + 1):
            if page > 1:
                # Random delay between requests to reduce the chance of being
                # blocked; sleeping before (not after) a request avoids a
                # pointless pause once the last page has been fetched.
                time.sleep(random.uniform(1, 3))

            print(f"正在爬取第{page}页...")

            products = self.parse_product_page(self._build_page_url(url, page))
            if not products:
                print("没有更多商品，停止爬取")
                break

            all_products.extend(products)

        return all_products

    def parse_product_page(self, url):
        """Fetch one listing page and return the products found on it.

        Returns an empty list when the request or the parsing fails; the
        error is printed rather than raised so a crawl can continue.
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            tree = html.fromstring(response.content)
            # Resolve relative href/src values against the final page URL so
            # the extracted links can be requested directly later on.
            tree.make_links_absolute(response.url, handle_failures='ignore')
            return self.extract_products(tree)

        except Exception as e:
            # Deliberate best-effort boundary: report and carry on.
            print(f"解析页面失败：{e}")
            return []

    def extract_products(self, tree):
        """Return a product dict for every product container found in *tree*."""
        # Each product lives in a <div> whose class is exactly "product-item".
        product_nodes = tree.xpath('//div[@class="product-item"]')

        extracted = (self.extract_single_product(node) for node in product_nodes)
        return [product for product in extracted if product]

    def extract_single_product(self, node):
        """Extract the fields of one product from its container element.

        All XPath expressions are relative (``.//``) so they only search
        inside *node*. Missing fields are returned as empty strings.

        Returns:
            A dict with the keys ``name``, ``link``, ``price``, ``rating``,
            ``review_count``, ``image_url`` and ``shop_name``, or ``None`` if
            extraction raised an error.
        """
        first = self._first
        try:
            return {
                'name': first(node.xpath('.//h3[@class="product-title"]/a/text()'), strip=True),
                'link': first(node.xpath('.//h3[@class="product-title"]/a/@href')),
                'price': first(node.xpath('.//span[@class="price"]/text()'), strip=True),
                'rating': first(node.xpath('.//div[@class="rating"]/@data-rating')),
                'review_count': first(node.xpath('.//span[@class="review-count"]/text()'), strip=True),
                'image_url': first(node.xpath('.//img[@class="product-img"]/@src')),
                'shop_name': first(node.xpath('.//span[@class="shop-name"]/text()'), strip=True),
            }

        except Exception as e:
            print(f"提取商品信息失败：{e}")
            return None

    def get_product_detail(self, product_url):
        """Fetch a product detail page and extract description, specs and images.

        Returns:
            A dict with the keys ``description`` (str), ``specifications``
            (dict built from the two-column specs table) and ``images``
            (list of image URLs), or an empty dict when anything fails.
        """
        try:
            response = self.session.get(product_url, timeout=10)
            response.raise_for_status()

            tree = html.fromstring(response.content)
            # Make image URLs usable outside the context of this page.
            tree.make_links_absolute(response.url, handle_failures='ignore')

            # Description: all text nodes below the description container.
            desc_nodes = tree.xpath('//div[@class="product-description"]//text()')
            description = ''.join(desc_nodes).strip()

            # Specifications: first cell is the key, second cell the value;
            # rows lacking either cell (e.g. header rows) are skipped.
            specs = {}
            for row in tree.xpath('//table[@class="specs-table"]//tr'):
                key_nodes = row.xpath('./td[1]/text()')
                value_nodes = row.xpath('./td[2]/text()')
                if key_nodes and value_nodes:
                    specs[key_nodes[0].strip()] = value_nodes[0].strip()

            # Image gallery, ignoring empty src attributes.
            image_nodes = tree.xpath('//div[@class="product-images"]//img/@src')
            images = [img for img in image_nodes if img]

            return {
                'description': description,
                'specifications': specs,
                'images': images
            }

        except Exception as e:
            print(f"获取商品详情失败：{e}")
            return {}

    def save_to_csv(self, products, filename='products.csv'):
        """Write the basic product fields to *filename* as UTF-8 CSV.

        Keys other than the listed field names (for example detail data
        merged into a product dict later) are ignored instead of raising.
        """
        if not products:
            print("没有数据需要保存")
            return

        fieldnames = ['name', 'link', 'price', 'rating', 'review_count',
                      'image_url', 'shop_name']

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(products)

        print(f"数据已保存到 {filename}")

# Usage example
if __name__ == "__main__":
    spider = ProductSpider()

    # Crawl the listing pages.
    products = spider.get_product_list('https://example-shop.com/search?q=手机')

    # Persist the basic listing fields.
    spider.save_to_csv(products)

    # Enrich the first five products with data from their detail pages.
    for index, product in enumerate(products[:5], start=1):
        if not product.get('link'):
            # No link was extracted for this product; a request for an empty
            # URL can only fail, so skip it without waiting.
            continue

        print(f"获取第{index}个商品的详细信息...")
        detail = spider.get_product_detail(product['link'])
        product.update(detail)
        # Random pause between detail requests.
        time.sleep(random.uniform(1, 2))

6. XPath调试技巧

6.1 浏览器调试

# 在浏览器开发者工具中测试XPath
# 1. 按F12打开开发者工具
# 2. 在Console中输入：
# $x('//div[@class="content"]')  // 测试XPath表达式
# $x('//div[@class="content"]')[0]  // 获取第一个匹配元素

6.2 Python调试

def debug_xpath(tree, xpath_expr):
    """Evaluate *xpath_expr* against *tree* and print a short summary.

    Args:
        tree: Any object exposing an ``xpath(expression)`` method.
        xpath_expr: The XPath expression to evaluate.

    Prints the expression, the number of matches and up to three of them.
    Expressions that evaluate to a single value rather than a node list
    (``count(...)``, ``string(...)``, boolean tests) have that value printed
    directly. An evaluation error is reported instead of raised.
    """
    try:
        result = tree.xpath(xpath_expr)
    except Exception as e:
        print(f"XPath表达式错误: {e}")
        return

    print(f"XPath: {xpath_expr}")

    if not isinstance(result, list):
        # Scalar result: calling len() on a float/bool would raise and be
        # misreported as an expression error, so show the value as-is.
        print(f"结果: {result}")
        return

    print(f"结果数量: {len(result)}")

    if not result:
        print("没有找到匹配的元素")
        return

    print("前3个结果:")
    for i, item in enumerate(result[:3], start=1):
        if isinstance(item, str):
            # text() and @attribute results are string values.
            print(f"  {i}: {item}")
        elif hasattr(item, 'tag'):
            # Element: show the tag together with its leading text, if any.
            text = (item.text or '').strip()
            print(f"  {i}: <{item.tag}> {text}".rstrip())
        else:
            print(f"  {i}: {item}")

# Usage example
# NOTE(review): the query below targets class="product" markup, so
# html_content presumably needs to hold the product sample from section 3.1
# at this point -- confirm before running.
tree = html.fromstring(html_content)
debug_xpath(tree, '//div[@class="product"]//h3/text()')

7. 性能优化

7.1 选择合适的解析器

# lxml比BeautifulSoup更快
from lxml import html
import time

def benchmark_parsers(html_content, iterations=1000):
    """Time lxml against BeautifulSoup on the same markup and print the results.

    Args:
        html_content: HTML markup to parse on every iteration.
        iterations: Number of parse-and-query rounds per library.

    Each round parses the markup and looks up the <h1> elements. The elapsed
    time of both libraries is printed, followed by their ratio when it can
    be computed.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump if the system clock
    # is adjusted during the run.

    # lxml
    start_time = time.perf_counter()
    for _ in range(iterations):
        tree = html.fromstring(html_content)
        tree.xpath('//h1/text()')
    lxml_time = time.perf_counter() - start_time

    print(f"lxml: {lxml_time:.4f}秒")

    # BeautifulSoup (imported here so bs4 is only required for this benchmark)
    from bs4 import BeautifulSoup
    start_time = time.perf_counter()
    for _ in range(iterations):
        soup = BeautifulSoup(html_content, 'html.parser')
        soup.find_all('h1')
    bs4_time = time.perf_counter() - start_time

    print(f"BeautifulSoup: {bs4_time:.4f}秒")

    # With iterations=0 the lxml time can be zero; skip the ratio rather
    # than raising ZeroDivisionError.
    if lxml_time > 0:
        print(f"lxml比BeautifulSoup快 {bs4_time/lxml_time:.2f}倍")

7.2 XPath优化技巧

# Before: an unspecific XPath that tests every element type at each step
slow_xpath = '//*[@class="content"]//*[@class="item"]//*[@class="title"]'

# After: a more specific path
# NOTE: this is not a drop-in equivalent of slow_xpath. It only matches
# div/div/h3 elements and requires the h3 to be a direct child of the item.
fast_xpath = '//div[@class="content"]//div[@class="item"]/h3[@class="title"]'

# Use an index instead of last()
# Slow: //li[last()]
# Fast: //li[3] (if the exact position is known)
# NOTE(review): the speed difference is asserted, not measured here -- confirm.

# Avoid a leading // when the exact path is known
# Slow: //div//span//text()
# Fast: /html/body/div/span/text()
# NOTE: the second expression only matches that one fixed path, so it
# returns fewer nodes than the first unless the document has that exact shape.

8. 实践练习

练习1：爬取新闻网站

使用XPath爬取新闻网站的文章列表，提取标题、链接、发布时间等信息。

练习2：解析XML数据

处理一个包含商品信息的XML文件，提取所有商品的详细信息。

练习3：复杂表格解析

使用XPath解析包含合并单元格的复杂HTML表格。

9. 课程小结

本课程我们学习了：

XPath语法和基本表达式
XPath高级功能和函数
lxml库的使用方法
XML和HTML的解析技巧
命名空间处理
实战案例和性能优化

10. 下节预告

下一课我们将学习：

正则表达式在爬虫中的应用
数据清洗和预处理技术
处理各种数据格式
数据验证和质量控制

11. 作业

使用XPath爬取一个电商网站的商品信息
练习编写复杂的XPath表达式
比较XPath和CSS选择器的性能差异
处理包含命名空间的XML文档

提示：XPath是强大的数据提取工具，熟练掌握其语法可以大大提高爬虫的效率和准确性。

希望对初学者有帮助；致力于办公自动化的小小程序员一枚

希望能得到大家的【❤️一个免费关注❤️】感谢！

求个 🤞 关注 🤞 +❤️ 喜欢 ❤️ +👍 收藏 👍

此外还有办公自动化专栏，欢迎大家订阅：Python办公自动化专栏

此外还有爬虫专栏，欢迎大家订阅：Python爬虫基础专栏

此外还有Python基础专栏，欢迎大家订阅：Python基础学习专栏