# Default file-name fragments searched for in <a href> and <img src> values.
# XPath contains() is a substring test, so '.doc' already matches '.docx' and
# '.xls' already matches '.xlsx'; the longer forms are listed for readability.
_ATTACHMENT_EXTENSIONS = ('.docx', '.doc', '.xlsx', '.xls', '.pdf', '.zip')
_IMAGE_EXTENSIONS = ('.jpg', '.png')


def get_content_and_images(page_tree, div_name, div_classes, *,
                           link_extensions=_ATTACHMENT_EXTENSIONS,
                           image_extensions=_IMAGE_EXTENSIONS):
    """Return the paragraph text and attachment/image URLs of a page section.

    The candidate classes are tried in order; the first one that yields
    either some text or at least one URL wins and the rest are not queried.

    Args:
        page_tree: Object exposing an ``xpath(expression)`` method that
            returns a list (presumably an lxml element or tree -- confirm
            against the caller).
        div_name: Tag name of the container element, e.g. ``'div'``.
        div_classes: Iterable of class-name fragments. A container matches
            when its ``class`` attribute contains the fragment as a
            substring.
        link_extensions: Substrings to look for in ``<a href>`` values.
        image_extensions: Substrings to look for in ``<img src>`` values.

    Returns:
        A ``(content, img_href_list)`` tuple. ``content`` is the
        concatenation of every text node found under ``<p>`` elements inside
        the matching container; ``img_href_list`` is the result of the URL
        query. ``('', [])`` is returned when no class produces anything.

    Note:
        ``div_name``, the class fragments and the extensions are
        interpolated into the XPath expression without escaping, so they
        must not contain quote characters.
    """
    for div_class in div_classes:
        container = f'//{div_name}[contains(@class, "{div_class}")]'

        text = ''.join(page_tree.xpath(f'{container}//p//text()'))

        # One union branch per extension instead of hand-copied expressions.
        queries = [
            f"{container}//a[contains(@href,'{ext}')]/@href"
            for ext in link_extensions
        ]
        queries += [
            f"{container}//img[contains(@src,'{ext}')]/@src"
            for ext in image_extensions
        ]
        # An empty string is not a valid XPath expression, so skip the query
        # when the caller asked for no extensions at all.
        links = page_tree.xpath(' | '.join(queries)) if queries else []

        # Stop at the first class that produced any text or any URL.
        if text or links:
            return text, links

    return '', []


div_name = 'div'
# NOTE(review): the class name is kept exactly as written in the original;
# presumably it mirrors the spelling used by the target page -- confirm.
div_classes = ['xinagxizhengwen']
content, img_href_list = get_content_and_images(page_tree, div_name, div_classes)
# Tags: extraction, name, img, contains, link, href, article, div, class
# Source: https://www.cnblogs.com/wolvies/p/18382832