想把以前喜欢的帖子爬下来,代码存档于 2024.4.26,不知道能用多久。
"""Scrape the landlord-only (see_lz=1) posts of a Baidu Tieba thread into a text file.

Archived 2024-04-26; Tieba's markup may change at any time and break the
XPath/class-name assumptions below.
"""
import requests
from lxml import etree


def removeTag(text):
    """Strip <a> and <img> elements from an HTML fragment and return the result.

    lxml stores the text that follows an element in that element's ``tail``;
    a plain ``remove()`` drops it, losing post text that came after a link or
    image. The tail is re-attached to the previous sibling (or the parent's
    text) before removal so no visible text is lost.
    """
    tree = etree.fromstring(text)
    for bad in tree.xpath("//a | //img"):
        parent = bad.getparent()
        if bad.tail:
            prev = bad.getprevious()
            if prev is not None:
                prev.tail = (prev.tail or "") + bad.tail
            else:
                parent.text = (parent.text or "") + bad.tail
        parent.remove(bad)
    return etree.tostring(tree, encoding="utf-8").decode("utf-8")


def getContent(url):
    """Fetch one page of the thread and return a list of cleaned post strings.

    Returns an empty list when the page contains no posts (i.e. the requested
    page number is past the last page), which the caller uses to stop.
    """
    # timeout so a dead or throttled page cannot hang the crawl forever
    r = requests.get(url=url, timeout=30)
    r.encoding = "utf-8"
    html = etree.HTML(r.text)
    # NOTE: the trailing space inside the class value is really present in
    # Tieba's markup -- do not "fix" it.
    posts = html.xpath("//div[@class='d_post_content j_d_post_content ']")
    cleaned = []
    for ele in posts:
        s = removeTag(etree.tostring(ele, encoding="utf-8").decode("utf-8"))
        s = s[s.find(">") + 1:]  # drop the opening <div ...> tag
        # lstrip replaces the old manual while-loop, which raised IndexError
        # on a post that was nothing but spaces
        s = s.lstrip(" ")
        s = s.replace("</div>", "")
        s = s.replace("<br/>", "\n")
        s = s.replace("<strong>", "").replace("</strong>", "")
        s = s.replace("</span>", "")
        s = s.replace('''<span class="edit_font_color">''', "")
        # last resort: drop any leftover angle brackets
        s = s.replace("<", "").replace(">", "")
        cleaned.append(s)
    return cleaned


def writePage(text, filename):
    """Append each post in *text* to *filename*, one post per line."""
    with open(filename, "a", encoding="utf-8") as f:
        for ele in text:
            f.write(ele + "\n")


if __name__ == "__main__":
    # Things you must fill in:
    # the thread URL -- remember to delete the number after "pn="
    url = "https://tieba.baidu.com/p/xxxxxxx?see_lz=1&pn="
    # the output file name
    filename = "xxx.txt"
    # start/end page numbers; the crawler sees roughly twice as many pages
    # as the browser shows
    st = 1
    ed = 10
    for i in range(st, ed + 1):
        url2 = url + str(i)
        text = getContent(url2)
        # bug fix: getContent returns a list, never the string "null", so the
        # old `text == "null"` test could never stop the loop early
        if not text:
            break
        writePage(text, filename)
    print("end.")
贴吧17年死了一次后面又活了,但好些楼没恢复。那些初中水的贴闭着眼睛还能想起镇楼图什么样,一抬眼却要大学毕业了。
标签:utf,text,爬虫,replace,帖子,bad,etree,pos2,百度 From: https://www.cnblogs.com/capterlliar/p/18160892