"""Tags: encoding, auto-detection, web page, encoding, utf, gbk, content, codes, return
From: https://www.cnblogs.com/xuchunlin/p/17371875.html

This method automatically detects and converts the encoding of a web page.
"""
# The third-party Python library chardet is unreliable: it misdetects GBK-encoded content as Windows-1254.
@retry(stop_max_attempt_number=5, wait_random_min=2000, wait_random_max=20000, )
def page_trancode(content):
    """Detect the encoding of a fetched page and normalise it where possible.

    The encoding is guessed with ``chardet.detect`` and handled per label
    (compared case-insensitively):

    * ``utf-8``   -- ``content`` is returned unchanged.
    * ``gbk``     -- decoded as GBK (undecodable bytes dropped) and re-encoded
      as UTF-8 bytes.
    * ``gb2312``  -- parsed with BeautifulSoup's ``html.parser`` using GBK as
      the source encoding; the serialised document is returned as ``str``.
    * ``unicode`` -- backslash escape sequences are expanded with the
      ``unicode_escape`` codec and the result is returned as ``str``.
    * anything else, including "no encoding detected" -- ``content`` is
      returned unchanged.

    NOTE: the return type therefore depends on the branch taken (the input
    object, UTF-8 bytes, or ``str``). This is kept as-is for backward
    compatibility with existing callers.

    :param content: raw page body, normally ``bytes`` as downloaded.
    :return: the converted (or untouched) page content, see above.
    """
    detected = chardet.detect(content)
    # chardet reports ``None`` when it cannot guess (e.g. empty input).
    # Normalise to a lower-case string so the comparisons below can neither
    # raise TypeError nor miss a label because of letter case.
    encoding = (detected.get('encoding') or '').lower()

    if encoding == 'utf-8':
        return content

    if encoding == 'gbk':
        return content.decode('gbk', 'ignore').encode('utf-8')

    if encoding == 'gb2312':
        # GBK is a superset of GB2312, so decoding as GBK also covers
        # characters outside the stricter GB2312 range.
        # ``from_encoding`` is the bs4 keyword; ``fromEncoding`` is only a
        # deprecated alias carried over from BeautifulSoup 3.
        return str(BeautifulSoup(content, 'html.parser', from_encoding="GBK"))

    if encoding == 'unicode':
        # bytes objects have no ``.encode`` on Python 3, so only encode when
        # the caller handed in text; bytes are decoded directly.
        raw = content if isinstance(content, bytes) else content.encode('utf-8')
        return raw.decode('unicode_escape')

    # Unknown or undetected encoding: hand the content back untouched.
    return content