该方法实现网页编码的自动识别和转换

发布时间: 2023-05-04 17:11:24  作者: 淋哥
"""
该方法实现网页编码的自动识别和转换
"""

# NOTE: the third-party chardet library is not fully reliable; it has been
# observed to report GBK-encoded pages as Windows-1254.
@retry(stop_max_attempt_number=5, wait_random_min=2000, wait_random_max=20000, )
def page_trancode(content):
    """Detect the encoding of a fetched page and transcode it where supported.

    The encoding is guessed with ``chardet.detect`` and compared
    case-insensitively against the handled names.

    Args:
        content: Raw page body as bytes (``chardet.detect`` is called on it
            directly).

    Returns:
        Depends on the detected encoding (the return type is not uniform,
        which is kept as-is for backward compatibility):

        * ``utf-8``   -- ``content`` unchanged.
        * ``gbk``     -- ``content`` re-encoded as UTF-8 bytes; undecodable
          bytes are dropped.
        * ``gb2312``  -- the document parsed by BeautifulSoup as GBK and
          rendered back to ``str``.
        * ``unicode`` -- ``content`` decoded with the ``unicode_escape`` codec.
        * anything else, or no detection result -- ``content`` unchanged.
    """
    detected = chardet.detect(content).get('encoding')
    # chardet reports None when it cannot guess (e.g. empty input). Normalise
    # to a lowercase string so the comparisons below never raise and do not
    # depend on the letter case chardet happens to use.
    encoding = detected.lower() if detected else ''

    if encoding == 'utf-8':
        return content
    if encoding == 'gbk':
        return content.decode('gbk', 'ignore').encode('utf-8')
    if encoding == 'gb2312':
        # Parse as GBK rather than GB2312: GBK is a superset, so characters
        # outside plain GB2312 still decode.
        return str(BeautifulSoup(content, 'html.parser', from_encoding='GBK'))
    if encoding == 'unicode':
        # content is bytes here, so decode it directly; bytes objects have no
        # encode() method in Python 3.
        return content.decode('unicode_escape')
    return content