简介
mhtml文件又称为聚合html文档、web档案或单一文件网页。单个文件网页可将网站的所有元素(包括文本和图形)都保存到单个文件中。总的来说mht文件保存了一个网页内的所有元素,让用户可以在没有网络的情况下访问网页。
本程序提取mht文件中的图片并保存至新建文件夹,同时将其压缩。食用方法:
- 将代码复制存为python文件
- mht文件放在代码同级目录下
- 双击运行代码
随后即可在同级目录下生成图片文件夹和对应的zip压缩文件。
实现效果
以下是保存的网页,是一章漫画。
运行程序截图:
>>>>>>> python -u "d:\CODE\extract_test\mht_img_extract.py"
[Find File] 第52话 - 想要成为影之实力者.mht
[File Nums] 1
------------------------------------------PROCESS [1/1]-----------------------------------
[File Name] 第52话 - 想要成为影之实力者.mht
[File Path] D:\CODE\extract_test
[New Folder] 第52话 - 想要成为影之实力者
[Saved Img] 36
[Raw/Compressed] 9.50 / 8.78 MB
文件保存截图:
实现思路
下面是截取mht文件部分内容:
------MultipartBoundary--K1usUkalTqg3WEUJLmLQ5xNVwipkHNGg7EXVdLUiFp----
Content-Type: image/jpeg
Content-Transfer-Encoding: base64
Content-Location: https://s1-a3-ussv.baozicdn.com/scomic/xiangyaochengweiyingzhishilizhe-fengzedajiebanyexingli/0/52-lros/1.jpg
/9j/4SSgRXhpZgAATU0AKgAAAAgADAEAAAMAAAABBFoAAAEBAAMAAAABBkAAAAECAAMAAAADAAAA
ngEGAAMAAAABAAIAAAESAAMAAAABAAEAAAEVAAMAAAABAAMAAAEaAAUAAAABAAAApAEbAAUAAAAB
AAAArAEoAAMAAAABAAIAAAExAAIAAAAhAAAAtAEyAAIAAAAUAAAA1YdpAAQAAAABAAAA7AAAASQA
CAAIAAgACvyAAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIDIyLjMgKE1hY2ludG9zaCkAMjAy
MzowNDozMCAwOTo1Njo1OQAAAAAABJAAAAcAAAAEMDIzMaABAAMAAAABAAEAAKACAAQAAAABAAAE
.................源文件base64编码太长,这里截取部分作为演示....................
+04k3NZyLMy+w+/+O38K9CgmjngSaJ90cihlb2PSuIkgjnieKVEaORdrKycMD/T+VT/De/kfQ7jR
bn/j50aZrX5uvldYj/3xigDt6KKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKK
ACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooA
in/495P9014r+z397xV/19r/AOzV7VP/AMe8n+6a8V/Z7+94q/6+1/8AZqAPb6KKKACiiigAoooo
AKKKKACkpaSgD//Z
------MultipartBoundary--K1usUkalTqg3WEUJLmLQ5xNVwipkHNGg7EXVdLUiFp----
Content-Type: image/jpeg
Content-Transfer-Encoding: base64
Content-Location: https://s1-a3-ussv.baozicdn.com/scomic/xiangyaochengweiyingzhishilizhe-fengzedajiebanyexingli/0/52-lros/6.jpg
从这里我们可以看出这是一个图片,使用base64编码。接下来我们就需要遍历mht文件,寻找形如 `------MultipartBoundary--K1usUkalTqg3WEUJLmLQ5xNVwipkHNGg7EXVdLUiFp----` 的分隔行。
找到这一行后,紧随其后的就是一个图片文件的说明部分,据此我们可以获取图片的基本信息。
- Content-Type:说明了图片文件类型
- Content-Transfer-Encoding:说明了文件编码方式
- Content-Location:说明了文件的网址
在base64编码的末尾有一个空行,读取到空行即说明遍历完了一张图片,接下来我们只需使用python的 `base64.b64decode()` 即可还原图片,之后将其保存即可。当然其中还有许多细节问题,比如图片名获取、图片大小限制、空行判断问题等等,在以下代码中均已实现。
实现代码
import base64
import os
import zipfile
# Behaviour when the output folder already exists:
# 1 = reuse it without asking, 0 = prompt the user for confirmation first.
FILE_EXIST_AUTO_REWRITE = 1
# Naming of extracted images:
# 1 = sequential zero-padded numbers (0001, 0002, ...),
# 0 = the name taken from the part's Content-Location header.
IMG_NAME_REWRITE = 1
# Image parts whose base64 body has this many lines or fewer are considered
# too small and are not saved.
BASE64_LINE_THRESHOLD = 500
def _is_part_boundary(line):
    """Return True if *line* (bytes) looks like a MIME part delimiter."""
    return line.startswith(b'-') and b'Boundary' in line


def _read_part_headers(file):
    """Read the header section of one MIME part into a dict.

    Reading stops after the blank line that terminates the headers (or at
    end of file). Keys are lower-cased header names, so the result does not
    depend on the order or number of headers in the part. Folded
    continuation lines are appended to the header they continue.
    """
    headers = {}
    last_name = None
    while True:
        raw = file.readline()
        if not raw.strip():  # blank line or EOF ends the header section
            return headers
        text = raw.decode('utf-8', errors='replace').rstrip('\r\n')
        if text[0] in ' \t':
            if last_name is not None:
                headers[last_name] += ' ' + text.strip()
            continue
        name, sep, value = text.partition(':')
        if sep:
            last_name = name.strip().lower()
            headers[last_name] = value.strip()


def saveImg(file_path, folder_path, line_threshold=None, rename_images=None):
    """Extract the base64-encoded images of an MHT file into a folder.

    The file is scanned line by line as bytes, so parts in arbitrary
    encodings never have to be decoded. Every part whose Content-Type is
    ``image/*`` and whose Content-Transfer-Encoding is ``base64`` is decoded
    and written to *folder_path*.

    Args:
        file_path: Path of the MHT file to read.
        folder_path: Existing directory that receives the images.
        line_threshold: Images whose base64 body has this many lines or
            fewer are skipped as too small. Defaults to
            ``BASE64_LINE_THRESHOLD``.
        rename_images: If true, images are named ``0001.<type>``,
            ``0002.<type>``, ...; otherwise the name from Content-Location
            is used. Defaults to ``IMG_NAME_REWRITE``.
    """
    if line_threshold is None:
        line_threshold = BASE64_LINE_THRESHOLD
    if rename_images is None:
        rename_images = bool(IMG_NAME_REWRITE)

    # Bookkeeping, handy to print when tuning the threshold.
    img_dic = {}      # key: saved image name, value: base64 line count
    skipped_dic = {}  # key: part name, value: reason / transfer encoding
    sifted_dic = {}   # key: part name, value: base64 line count (too small)

    with open(file_path, 'rb') as file:
        line = file.readline()
        while line:
            if not _is_part_boundary(line):
                line = file.readline()
                continue

            headers = _read_part_headers(file)
            mime_type = headers.get('content-type', '').split(';')[0]
            main_type, _, file_type = mime_type.strip().lower().partition('/')
            file_encode = headers.get('content-transfer-encoding', '').strip().lower()
            location = headers.get('content-location', '')
            # Last path component of the URL, without the query string.
            file_name = location.split('?')[0].rstrip('/').rsplit('/', 1)[-1]

            if main_type != 'image' or file_encode != 'base64':
                if headers:
                    skipped_dic[file_name] = file_encode
                line = file.readline()
                continue

            # Collect the body up to the blank line, the next boundary or
            # EOF. The terminating line stays in `line` so that a boundary
            # directly following the body is handled by the outer loop.
            chunks = []
            while True:
                line = file.readline()
                chunk = line.strip()
                if not chunk or _is_part_boundary(line):
                    break
                chunks.append(chunk)

            if len(chunks) <= line_threshold:
                sifted_dic[file_name] = len(chunks)
                continue

            try:
                image_bytes = base64.b64decode(b''.join(chunks))
            except ValueError:  # binascii.Error: truncated or corrupt data
                print('[Skip Img]', file_name, '(invalid base64)')
                skipped_dic[file_name] = 'invalid base64'
                continue

            stem = os.path.splitext(file_name)[0]
            if rename_images or not stem:
                # zero-padded running number keeps the page order sortable
                img_name = str(len(img_dic) + 1).zfill(4) + '.' + file_type
            else:
                img_name = stem + '.' + file_type
            img_dic[img_name] = len(chunks)
            with open(os.path.join(folder_path, img_name), 'wb') as img:
                img.write(image_bytes)

    print('[Saved Img] %d'%(len(img_dic)))
def getDirSize(dir):
    """Return the total size in bytes of all files below *dir*.

    The directory is walked recursively. Entries whose size cannot be read
    (for example a broken symlink, or a file removed while walking) are
    ignored instead of aborting the whole computation.
    """
    size = 0
    for root, _dirs, files in os.walk(dir):
        for name in files:
            try:
                size += os.path.getsize(os.path.join(root, name))
            except OSError:
                continue
    return size
def zipImg(folder_path, folder_name):
    """Compress the entries of *folder_path* into ``<folder_name>.zip``.

    The archive is created relative to the current working directory. Only
    the direct entries of the folder are added, each stored under its bare
    name, in sorted order so the archive lists pages in sequence. Afterwards
    the raw folder size and the archive size are printed in MB.

    Args:
        folder_path: Directory whose entries are archived.
        folder_name: Base name of the archive (``.zip`` is appended).
    """
    zip_name = folder_name + '.zip'
    # The context manager closes the archive even if a write fails.
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for entry in sorted(os.listdir(folder_path)):
            archive.write(os.path.join(folder_path, entry), entry)
    dir_size = getDirSize(folder_path)
    zip_size = os.path.getsize(zip_name)
    print(r'[Raw/Compressed] %.2f / %.2f MB'%(dir_size/1024/1024, zip_size/1024/1024))
def extractFile(FILE_NAME):
    """Extract and zip the images of one MHT file in the working directory.

    The images go into a folder named after the file (extension removed),
    which is created if missing. If the folder already exists and
    ``FILE_EXIST_AUTO_REWRITE`` is 0, the user is asked for confirmation;
    only an explicit ``y`` continues, any other answer leaves the folder
    untouched.

    Args:
        FILE_NAME: Name of the MHT file, relative to the current directory.
    """
    CUR_PATH = os.getcwd()
    cur_file = os.path.join(CUR_PATH, FILE_NAME)
    if not os.path.isfile(cur_file):
        print("No This File!")
        return

    print("[File Name]", FILE_NAME)
    print("[File Path]", CUR_PATH)
    folder_name = os.path.splitext(FILE_NAME)[0]
    folder_path = os.path.join(CUR_PATH, folder_name)

    if os.path.exists(folder_path):
        if FILE_EXIST_AUTO_REWRITE == 0:
            print("Folder Exist:", folder_name, "Rewrite It? [y/n]", end=" ")
            # Anything but an explicit "y" (typo, empty input) is a refusal.
            if input().strip().lower() != 'y':
                return
    else:
        os.makedirs(folder_path)
        print("[New Folder]", folder_name)

    saveImg(cur_file, folder_path)
    zipImg(folder_path, folder_name)
def getTargetFile():
    """Return the MHT files found in the current working directory.

    A name qualifies when it is a regular file whose extension is ``.mht``
    or ``.mhtml``, compared case-insensitively. Each match and the total
    count are printed.

    Returns:
        Sorted list of matching file names (without directory part).
    """
    cwd = os.getcwd()
    target_file_list = sorted(
        name for name in os.listdir(cwd)
        if os.path.splitext(name)[1].lower() in ('.mht', '.mhtml')
        and os.path.isfile(os.path.join(cwd, name))
    )
    for file in target_file_list:
        print('[Find File]', file)
    print('[File Nums]', len(target_file_list))
    return target_file_list
if __name__ == '__main__':
    # Process every MHT file found in the working directory, one by one,
    # printing a progress banner before each file.
    mht_files = getTargetFile()
    total = len(mht_files)
    for position, mht_name in enumerate(mht_files, start=1):
        banner = "\n------------------------------------------PROCESS [{}/{}]-----------------------------------"
        print(banner.format(position, total))
        extractFile(mht_name)
标签:content,提取,python,base64,Content,file,mht,line
From: https://www.cnblogs.com/PaintingSky/p/17789210.html