from lxml import etree

# 1. Terminology
# Tree: the whole HTML or XML structure.
# Node: every tag of an HTML/XML document is a node.
# Root node: the first node of the tree; in HTML this is the <html> tag.
# Attribute: a node attribute (in HTML, a tag attribute).

# 2. XML data format
# JSON and XML are two general-purpose formats used to exchange data
# between programs written in different languages.

# 1) Prepare the data.
# NOTE: the root tag is spelled "supermaket" in the data, so every absolute
# path below has to use exactly that spelling.
xml_str = """
<supermaket><name>永辉超市</name><address>肖家河大厦</address><goodslist><goods name="泡面" price="3.5" count="20"></goods><goods name="矿泉水" price="2" count="50"></goods><goods name="面包" price="5" count="15"></goods></goodslist><goods price="35" count="20"><name>烟</name></goods><worker_list><cashier name="张三" pay="4000"></cashier><shoppingGuide name="李四" pay="3500"></shoppingGuide></worker_list>
</supermaket>
"""

# 2) Build the tree and get its root node.
supermarket = etree.XML(xml_str)
# print(supermarket)

# 3) Select nodes.
# node.xpath(path) looks up the nodes matching `path` and returns a list.

# a. Absolute path: written as /path and always resolved from the root of
#    the document, no matter which node .xpath() is called on.
cashier = supermarket.xpath('/supermaket/worker_list/cashier')
print(cashier)  # [<Element cashier at 0x...>]

worker_list = supermarket.xpath('/supermaket/worker_list')[0]
print(worker_list)  # <Element worker_list at 0x...>

# Still resolved from the document root, whose tag is not "worker_list",
# so nothing matches.
result = worker_list.xpath('/worker_list/cashier')
print(result)  # []

# b. Relative path: "." stands for the node .xpath() is called on.
#    The leading "./" may be omitted.
cashier = supermarket.xpath('./worker_list/cashier')
print(cashier)  # [<Element cashier at 0x...>]

cashier = worker_list.xpath('./cashier')
print(cashier)  # [<Element cashier at 0x...>]

cashier = worker_list.xpath('cashier')
print(cashier)  # [<Element cashier at 0x...>]

# c. //path searches the whole document from any depth; the result does not
#    depend on the node .xpath() is called on.
result = supermarket.xpath('//cashier')
print(result)  # [<Element cashier at 0x...>]

# Matches the three <goods> inside <goodslist> plus the one directly under
# the root: four elements in total.
result = supermarket.xpath('//goods')
print(result)

# Only the three <goods> that are direct children of <goodslist>.
result = supermarket.xpath('//goodslist/goods')
print(result)

# 4) Get node text.
# Syntax: <path to the node>/text()
name = supermarket.xpath('./name/text()')
print(name)  # ['永辉超市']

names = supermarket.xpath('//name/text()')
print(names)  # ['永辉超市', '烟']

# 5) Get attribute values.
# Syntax: <path to the node>/@attribute_name
prices = supermarket.xpath('//goods/@price')
print(prices)  # ['3.5', '2', '5', '35']
from lxml import etree

# Parse the HTML fixture; the context manager closes the file handle as soon
# as its content has been read.
with open('files/test.html', encoding='utf-8') as html_file:
    html = etree.HTML(html_file.read())

h1 = html.xpath('//h1/text()')
print(h1)

h1 = html.xpath('./body/h1/text()')
print(h1)

# 1. Predicates (conditions)
# Syntax: path_to_tag[predicate]

# 1) [N] - the N-th tag (1-based) among same-named siblings.
p = html.xpath('./body/p[1]/text()')
print(p)

result = html.xpath('//li[1]/p/text()')
print(result)

# 2) [last()]   - the last tag among same-named siblings.
#    [last()-N] - the (N+1)-th tag counted from the end.
result = html.xpath('./body/ul/li[last()-1]/p[last()]/text()')
print(result)

# 3) Position ranges:
#    [position()>N]
#    [position()<N]
#    [position()>=N]
#    [position()<=N]
result = html.xpath('./body/ul/li[position()<=2]/p/text()')
print(result)

# 4) [@attr] - tags that have the given attribute.
#    p[@class] - <p> tags that have a class attribute.
result = html.xpath('./body/div/p[@class]/text()')
print(result)

#    [@attr='value'] - tags whose attribute equals the given value.
result = html.xpath('./body/div/p[@class="b"]/text()')
print(result)

# 5) [child > / < / >= / <= / = value] - filter tags by the content of one
#    of their child tags.
result = html.xpath('./body/ul/li[p[2]>4]/p/text()')
print(result)

result = html.xpath('./body/ul/li[p[3]>30]/p[1]/text()')
print(result)

result = html.xpath('./body/ul/li[p[1] = "面包"]/p/text()')
print(result)

# 2. Wildcard: *
# 1) Stands for any tag.
result = html.xpath('./body/div[@id="div1"]/*')
print(result)

result = html.xpath('./body/div[@id="div1"]/*[@class]')
print(result)

result = html.xpath('//*[@class="c1"]/text()')
print(result)

# 2) Stands for any attribute.
result = html.xpath('./body/div[last()]/p[@*]/text()')
print(result)

# 3. Union (select several paths at once): |
# Note: each side of the vertical bar must be a complete path of its own.
result = html.xpath('./body/ul/li/p[1]/text()|./body/ul/li/p[2]/text()')
print(result)
<!DOCTYPE html>
<!-- Sample shop page used as input for XPath queries; presumably the content of files/test.html (confirm against the script that loads it). -->
<html lang="en">
<head><meta charset="UTF-8"><title>商店</title>
</head>
<body>
<h1>永辉超市</h1><p>肖家河大厦</p><p>营业中</p><ul><li><p class="name">泡面</p><p class="price">3.5</p><p class="count">15</p></li><li><p class="name">矿泉水</p><p class="price">2</p><p class="count">120</p></li><li><p class="name">面包</p><p class="price">5</p><p class="count">42</p></li><li><p class="name">充电宝</p><p class="price">150</p><p class="count">10</p></li></ul><div><p id="a">p1</p><p class="b">p2</p><p class="c1">p3</p><p class="d">p4</p></div><div id="div1"><p class="c1">p1</p><p id="p2">p2</p><a href="">a1</a><span class="c1">span1</span><img src="=http%3A%2F%2Fbpic.588ku%2Felement_origin_min_pic%2F17%2F06%2F13%2F5c5a1442f0ec72e59829ee10d891f224.jpg%21r650&refer=http%3A%2F%2Fbpic.588ku&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1631690803&t=ddfb673477426b3255f364e59966b2f1"></div></body>
</html>
本文发布于:2024-02-03 01:06:18,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170689358847648.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |