以拉勾网的页面(已保存为本地文件 test.html)为例,使用 lxml 的 XPath 对它进行以下操作:
获取所有的a标签:
1 2 3 4 5 6 7 |
from lxml import etree

# Parse the saved page with an explicit UTF-8 HTML parser.
html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("test.html", parser=html_parser)

# xpath() always returns a list of matching elements.
anchors = tree.xpath("//a")
for anchor in anchors:
    # Serialize each element back to markup for display.
    print(etree.tostring(anchor, encoding="utf-8").decode("utf-8"))
获取第二个a标签:
1 2 3 4 5 |
from lxml import etree

html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("test.html", parser=html_parser)

# NOTE(review): //a[2] selects every <a> that is the second <a> among
# its siblings — not necessarily the second <a> in the document.
# [0] then takes the first such match.
second_anchor = tree.xpath("//a[2]")[0]
print(etree.tostring(second_anchor, encoding="utf-8").decode("utf-8"))
获取所有div标签中class属性等于modal fade upload-resume-modal的标签:
1 2 3 4 5 6 7 |
from lxml import etree

html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("test.html", parser=html_parser)

# @class='...' is an exact string match: the attribute must equal the
# whole value, including all three space-separated class names.
modal_divs = tree.xpath("//div[@class='modal fade upload-resume-modal']")
for modal in modal_divs:
    print(etree.tostring(modal, encoding="utf-8").decode("utf-8"))
获取所有div标签的class属性:
1 2 3 4 5 6 7 8 |
from lxml import etree

html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("test.html", parser=html_parser)

# Selecting an attribute (@class) yields plain strings, so there is no
# need for etree.tostring — each item can be printed directly.
class_values = tree.xpath("//div/@class")
for class_value in class_values:
    print(class_value)
获取所有职位链接:
1 2 3 4 5 6 7 8 9 |
from lxml import etree

html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("test.html", parser=html_parser)

# contains() matches class attributes that include 'con_list_item'
# anywhere in their value.
job_items = tree.xpath("//li[contains(@class,'con_list_item')]")
for job_item in job_items:
    # The leading "." scopes the query to this element; a bare "//"
    # would search the whole document again instead of this subtree.
    href = job_item.xpath(".//a[@class='position_link']/@href")
    print(href)
