问题
当爬取 linovelib 上的小说正文时,发现提取出来的内容中存在无法正常显示的乱码字符,如下所示:
注:上述内容来源 这里
猜测
- 猜测一:解析时使用的字符编码与源网址不一致;
- 猜测二:乱码字符为该网站自定义字体文件内专属的字符,即猜测该网站使用了字体反爬;
- 猜测三:网站内的js脚本对正文内容进行了再次加工,实际的加密信息存放在js脚本中
分析
看到乱码首先判断是否字符编码异常所致,遂有了猜测一。
通过查看网页的编码,发现其使用的是标准的 utf-8 编码;当我指定采用 utf-8 编码解析内容时,上述的乱码问题依旧存在,于是判断猜测一有误。
猜测一被否后,于是开始分析网页加载的资源文件,发现其加载了一个名为 icomoon.ttf 的字体文件,遂有了猜测二。
后续通过字体软件解析该文件时并未发现相关的文字信息,于是判断猜测二有误。
上述猜测一一被否决后,开始产生一个疑惑:其正文的html文档中的结构和字体显示是正常的,但是通过网址抓取时却显示异常;遂有了猜测三。
详细的分析过程
首先将当前网页的所有信息保存到本地的html文件中,逐步删减其中的js脚本,发现将 <script>yuedu();</script> 这行js代码删除后,页面将显示乱码的正文内容,于是可以断定,处理乱码的规则必定在 yuedu() 这个js函数中。
后续需查找该函数的内容,通过分析浏览器控制台 Sources 下的js文件,发现该函数的内容如下:
function yuedu() {
......
var d, e, f, g, h, i, j, k;
......
k = document.getElementById("TextContent").innerHTML,
//k = k.replace(new RegExp(" ", "gi"), "<p>").replace(new RegExp("<br><br>", "gi"), "</p>").replace(new RegExp("<br>\n<br>", "gi"), "</p>"),
k = eval(function(p,a,c,k,e,d){e=function(c){return(c<a?"":e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p;}('k.1(2 0(" ","3"),"<p>").1(2 0("<p>\\n</p>","3"),"<4>").1(2 0("<p> </p>","3"),"<4>").1(2 0("<p></p>","3"),"<4>").1(2 0("","3"),"\\F").1(2 0("","3"),"\\E").1(2 0("","3"),"\\D").1(2 0("","3"),"\\I").1(2 0("","3"),"\\H").1(2 0("","3"),"\\G").1(2 0("","3"),"\\z").1(2 0("","3"),"\\y").1(2 0("","3"),"\\x").1(2 0("","3"),"\\C").1(2 0("","3"),"\\B").1(2 0("","3"),"\\A").1(2 0("","3"),"\\R").1(2 0("","3"),"\\Q").1(2 0("","3"),"\\P").1(2 0("","3"),"\\U").1(2 0("","3"),"\\T").1(2 0("","3"),"\\S").1(2 0("","3"),"\\L").1(2 0("","3"),"\\K").1(2 0("","3"),"\\J").1(2 0("","3"),"\\O").1(2 0("","3"),"\\N").1(2 0("","3"),"\\M").1(2 0("","3"),"\\w").1(2 0("","3"),"\\d").1(2 0("","3"),"\\c").1(2 0("","3"),"\\b").1(2 0("","3"),"\\g").1(2 0("","3"),"\\f").1(2 0("","3"),"\\e").1(2 0("","3"),"\\7").1(2 0("","3"),"\\6").1(2 0("","3"),"\\5").1(2 0("","3"),"\\a").1(2 0("","3"),"\\9").1(2 0("","3"),"\\8").1(2 0("","3"),"\\s").1(2 0("","3"),"\\r").1(2 0("","3"),"\\q").1(2 0("","3"),"\\v").1(2 0("","3"),"\\u").1(2 0("","3"),"\\t").1(2 0("","3"),"\\j").1(2 0("","3"),"\\i").1(2 0("","3"),"\\h").1(2 0("","3"),"\\o").1(2 0("","3"),"\\m").1(2 0("","3"),"\\l").1(2 0("","3"),"\\V").1(2 0("","3"),"\\1t").1(2 0("","3"),"\\1s").1(2 0("","3"),"\\1r").1(2 0("","3"),"\\1w").1(2 0("","3"),"\\1v").1(2 0("","3"),"\\1u").1(2 0("","3"),"\\1n").1(2 0("","3"),"\\1m").1(2 0("","3"),"\\1l").1(2 0("","3"),"\\1q").1(2 0("","3"),"\\1p").1(2 0("","3"),"\\1o").1(2 0("","3"),"\\1x").1(2 0("","3"),"\\1G").1(2 0("","3"),"\\1F").1(2 0("","3"),"\\1E").1(2 0("","3"),"\\1J").1(2 0("","3"),"\\1I").1(2 0("","3"),"\\1H").1(2 0("","3"),"\\1A").1(2 0("","3"),"\\1z").1(2 0("","3"),"\\1y").1(2 
0("","3"),"\\1D").1(2 0("","3"),"\\1C").1(2 0("","3"),"\\1B").1(2 0("","3"),"\\14").1(2 0("","3"),"\\13").1(2 0("","3"),"\\12").1(2 0("","3"),"\\17").1(2 0("","3"),"\\16").1(2 0("","3"),"\\15").1(2 0("","3"),"\\Y").1(2 0("","3"),"\\X").1(2 0("","3"),"\\W").1(2 0("","3"),"\\11").1(2 0("","3"),"\\10").1(2 0("","3"),"\\Z").1(2 0("","3"),"\\18").1(2 0("","3"),"\\1h").1(2 0("","3"),"\\1g").1(2 0("","3"),"\\1f").1(2 0("","3"),"\\1k").1(2 0("","3"),"\\1j").1(2 0("","3"),"\\1i").1(2 0("","3"),"\\1b").1(2 0("","3"),"\\1a").1(2 0("","3"),"\\19").1(2 0("","3"),"\\1e").1(2 0("","3"),"\\1d").1(2 0("","3"),"\\1c");',62,108,'RegExp|replace|new|gi|br|u51fa|u5979|u8981|u91cc|u5f97|u4e5f|u7740|u5e74|u56fd|u548c|u90a3|u5c31|u8fc7|u800c|u4e0b||u80fd|u53bb||u5929||u4ee5|u81ea|u540e|u53ef|u5bb6|u4f1a|u751f|u4ed6|u5728|u4eba|u4e2a|u8fd9|u6709|u662f|u4e00|u7684|u4e0d|u6211|u4e86|u5b50|u4e3a|u5730|u8bf4|u4f60|u4e2d|u6765|u4eec|u4e0a|u5927|u65f6|u5230|u5bf9|u830e|u6db2|u9634|u8089|u547b|u6b32|u79cd|u4f5c|u60f3|u4e73|u7f8e|u5f00|u4ea4|u8131|u5c04|u8214|u5507|u9a9a|u88f8|u79c1|u80f8|u6027|u81c0|u6deb|u7a74|u90fd|u4e4b|u4e48|u8d77|u770b|u597d|u7136|u591a|u5c0f|u5b66|u5fc3|u4e8e|u53d1|u7528|u8fd8|u628a|u9053|u6837|u7b2c|u6210|u6ca1|u5f53|u4e8b|u5982|u53ea'.split('|'),0,{}))
document.getElementById("TextContent").innerHTML = k
}
分析上面的js脚本可以得出结论:该脚本使用 replace() 方法,将正文中的乱码字符逐个替换为对应的正常汉字(脚本中以 \uXXXX 形式的 Unicode 转义给出)。
提取对应的字典信息如下:
通过上述分析,linovelib
小说的乱码问题已经解决。
处理脚本
为方便使用,此处贴出我的python脚本,仅作参考:
linovelib_content_dict = {
" ": "<p>", "<p>\\n</p>": "<br>", "<p> </p>": "<br>", "<p></p>": "<br>",
"": "\u7684", "": "\u4e00", "": "\u662f", "": "\u4e86", "": "\u6211",
"": "\u4e0d", "": "\u4eba", "": "\u5728", "": "\u4ed6", "": "\u6709",
"": "\u8fd9", "": "\u4e2a", "": "\u4e0a", "": "\u4eec", "": "\u6765",
"": "\u5230", "": "\u65f6", "": "\u5927", "": "\u5730", "": "\u4e3a",
"": "\u5b50", "": "\u4e2d", "": "\u4f60", "": "\u8bf4", "": "\u751f",
"": "\u56fd", "": "\u5e74", "": "\u7740", "": "\u5c31", "": "\u90a3",
"": "\u548c", "": "\u8981", "": "\u5979", "": "\u51fa", "": "\u4e5f",
"": "\u5f97", "": "\u91cc", "": "\u540e", "": "\u81ea", "": "\u4ee5",
"": "\u4f1a", "": "\u5bb6", "": "\u53ef", "": "\u4e0b", "": "\u800c",
"": "\u8fc7", "": "\u5929", "": "\u53bb", "": "\u80fd", "": "\u5bf9",
"": "\u5c0f", "": "\u591a", "": "\u7136", "": "\u4e8e", "": "\u5fc3",
"": "\u5b66", "": "\u4e48", "": "\u4e4b", "": "\u90fd", "": "\u597d",
"": "\u770b", "": "\u8d77", "": "\u53d1", "": "\u5f53", "": "\u6ca1",
"": "\u6210", "": "\u53ea", "": "\u5982", "": "\u4e8b", "": "\u628a",
"": "\u8fd8", "": "\u7528", "": "\u7b2c", "": "\u6837", "": "\u9053",
"": "\u60f3", "": "\u4f5c", "": "\u79cd", "": "\u5f00", "": "\u7f8e",
"": "\u4e73", "": "\u9634", "": "\u6db2", "": "\u830e", "": "\u6b32",
"": "\u547b", "": "\u8089", "": "\u4ea4", "": "\u6027", "": "\u80f8",
"": "\u79c1", "": "\u7a74", "": "\u6deb", "": "\u81c0", "": "\u8214",
"": "\u5c04", "": "\u8131", "": "\u88f8", "": "\u9a9a", "": "\u5507"}
def parse_garbled_text(content: str, mapping: "dict | None" = None) -> str:
    """Replace obfuscated placeholder glyphs in scraped text with real characters.

    Args:
        content: Raw chapter text/HTML as extracted from the page.
        mapping: Optional table of ``{garbled: replacement}`` pairs. When
            omitted, the module-level ``linovelib_content_dict`` is used.

    Returns:
        ``content`` with every occurrence of each non-empty key replaced by
        its value. Replacements are applied one after another in the
        mapping's iteration order, so a later pair also sees the output of
        earlier pairs.
    """
    if mapping is None:
        mapping = linovelib_content_dict
    for garbled, plain in mapping.items():
        # str.replace("", x) inserts x between every character; an empty key
        # (e.g. a glyph lost to copy/paste or re-encoding) must be skipped
        # rather than allowed to corrupt the whole text.
        if not garbled:
            continue
        content = content.replace(garbled, plain)
    return content
标签:content,记录,replace,乱码,linovelib,js,猜测
From: https://www.cnblogs.com/garbler/p/17262350.html