废话不多说,直接上代码!!!
初衷是:在解析HTML时使用html2text库,感觉表格解析得不尽如人意;也尝试过pypandoc,效果同样一般,最后还是决定使用bs4自己解析~
代码优化点:
① 解析HTML表格转化为标准markdown
② 解析HTML图片转化为标准markdown
③ 解析HTML中的标题段落换行等
④ 兼容非HTML的纯文本输入
from bs4 import BeautifulSoup
def _span_of(cell, attr: str) -> int:
    """Return the ``rowspan``/``colspan`` of *cell* as an integer >= 1.

    Missing, non-numeric, zero or negative values are treated as 1 so that a
    malformed attribute cannot crash the conversion or stall the column
    cursor.
    """
    try:
        value = int(cell.get(attr, 1))
    except (TypeError, ValueError):
        return 1
    return max(value, 1)


def _cell_text(cell) -> str:
    """Return the text of a table cell, made safe for a Markdown table row.

    Whitespace runs (including line breaks) are collapsed to single spaces and
    pipe characters are escaped, because either would otherwise break the row
    into extra lines or extra columns.
    """
    text = ' '.join(cell.get_text(strip=True).split())
    return text.replace('|', '\\|')


def _own_rows(table_tag) -> list:
    """Return the ``<tr>`` elements that belong to *table_tag* itself.

    Rows of nested tables are excluded so they are not emitted a second time
    as rows of the outer table.
    """
    return [
        row for row in table_tag.find_all('tr')
        if row.find_parent('table') is table_tag
    ]


def _own_cells(row) -> list:
    """Return the ``<td>``/``<th>`` cells whose nearest ``<tr>`` is *row*."""
    return [
        cell for cell in row.find_all(['td', 'th'])
        if cell.find_parent('tr') is row
    ]


def _header_texts(cells) -> list:
    """Expand header cells into one label per column, repeating over colspan."""
    labels = []
    for cell in cells:
        labels.extend([_cell_text(cell)] * _span_of(cell, 'colspan'))
    return labels


def _image_to_markdown(img_tag) -> str:
    """Render an ``<img>`` tag as a Markdown image (``![alt](src)``).

    A missing ``src`` or ``alt`` becomes an empty string rather than the
    literal text ``None``.
    """
    src = img_tag.get('src') or ''
    alt = img_tag.get('alt') or ''
    return f'![{alt}]({src})'


def _table_to_markdown(table_tag) -> str:
    """Render a ``<table>`` tag as a Markdown table.

    The first row supplies the header when it contains ``<th>`` cells, or when
    it is a plain ``<td>`` row without a real rowspan. If the first row is a
    ``<td>`` row that uses rowspan, generic headers (``标题 N``) are
    synthesized and the first row is kept as data. Cells merged with
    ``rowspan``/``colspan`` are expanded by repeating their text in every
    covered position. Cells beyond the header width are dropped.

    Returns an empty string for a table with no rows or no cells.
    """
    all_rows = _own_rows(table_tag)
    if not all_rows:
        return ''  # an empty table is simply dropped

    first_cells = _own_cells(all_rows[0])
    if any(cell.name == 'th' for cell in first_cells):
        # Header row: use every cell (th and td) so the width matches the body.
        headers = _header_texts(first_cells)
        body_rows = all_rows[1:]
    elif any(_span_of(cell, 'rowspan') > 1 for cell in first_cells):
        # The first row carries data that spans several rows, so it cannot be
        # consumed as a header; synthesize numbered headers instead.
        width = sum(_span_of(cell, 'colspan') for cell in first_cells)
        headers = [f'标题 {i+1}' for i in range(width)]
        body_rows = all_rows
    else:
        headers = _header_texts(first_cells)
        body_rows = all_rows[1:]

    total_cols = len(headers)
    if total_cols == 0:
        # The first row had no cells: size the table from the widest row.
        total_cols = max(
            sum(_span_of(cell, 'colspan') for cell in _own_cells(row))
            for row in all_rows
        )
        if total_cols == 0:
            return ''
        headers = [f'标题 {i+1}' for i in range(total_cols)]

    table_data = []
    # start column -> (text, rows still to fill, width in columns)
    pending_spans = {}
    for row in body_rows:
        row_data = [''] * total_cols
        # Occupancy is tracked separately from the text so that a spanning
        # cell with empty text still blocks its columns.
        occupied = [False] * total_cols

        # Carry down cells that span into this row from the rows above.
        for start in list(pending_spans):
            text, remaining, width = pending_spans[start]
            for idx in range(start, min(start + width, total_cols)):
                row_data[idx] = text
                occupied[idx] = True
            if remaining > 1:
                pending_spans[start] = (text, remaining - 1, width)
            else:
                del pending_spans[start]

        col_index = 0
        for cell in _own_cells(row):
            # Skip columns already taken by a rowspan from above.
            while col_index < total_cols and occupied[col_index]:
                col_index += 1
            if col_index >= total_cols:
                break
            rowspan = _span_of(cell, 'rowspan')
            colspan = _span_of(cell, 'colspan')
            text = _cell_text(cell)
            if rowspan > 1:
                pending_spans[col_index] = (text, rowspan - 1, colspan)
            for idx in range(col_index, min(col_index + colspan, total_cols)):
                row_data[idx] = text
                occupied[idx] = True
            col_index += colspan
        table_data.append(row_data)

    lines = [
        '| ' + ' | '.join(headers) + ' |',
        '| ' + ' | '.join(['---'] * total_cols) + ' |',
    ]
    lines.extend('| ' + ' | '.join(row) + ' |' for row in table_data)
    return '\n'.join(lines) + '\n'


def clean_html_tag_with_bs4(html: str) -> str:
    """Convert an HTML fragment to plain text with Markdown tables and images.

    Images become ``![alt](src)``, tables become Markdown tables (merged
    cells are expanded), and ``<br>``, paragraphs, headings and list items are
    turned into line breaks. All other tags are stripped and runs of blank
    lines are collapsed to one. Input that contains no HTML is returned as
    cleaned-up plain text.

    Args:
        html: HTML source (or plain text). Empty or ``None`` yields ``''``.

    Returns:
        The converted text with leading and trailing whitespace removed.
    """
    if not html:
        return ''
    soup = BeautifulSoup(html, 'html.parser')

    # Images first, so that images inside table cells end up in the cell text.
    for img_tag in soup.find_all('img'):
        img_tag.replace_with(_image_to_markdown(img_tag))

    # Swap each table for a placeholder so that the line clean-up below does
    # not touch the generated Markdown; the tables are restored at the end.
    table_placeholders = {}
    for index, table_tag in enumerate(soup.find_all('table')):
        if table_tag.find_parent('table') is not None:
            continue  # nested table: its text is part of the outer cell
        table_markdown = _table_to_markdown(table_tag)
        if not table_markdown:
            table_tag.decompose()
            continue
        # The trailing delimiter keeps placeholder 1 from being a prefix of
        # placeholder 10, 11, ... when the text is restored with str.replace.
        placeholder = f'@@TABLE_PLACEHOLDER_{index}@@'
        table_placeholders[placeholder] = table_markdown
        # Put the table on its own line, separated from preceding text.
        table_tag.replace_with(f'\n\n{placeholder}\n')

    # Line breaks.
    for br in soup.find_all('br'):
        br.replace_with('\n')
    # Paragraphs are followed by a blank line.
    for p in soup.find_all('p'):
        p.insert_after(soup.new_string('\n\n'))
    # Headings are flattened to their text followed by a line break.
    for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        heading.string = f"{heading.get_text()}\n"
    # Each list item ends its line; handled once per item so that nested
    # lists do not accumulate extra blank lines.
    for li in soup.find_all('li'):
        if li.find_parent(['ul', 'ol']) is not None:
            li.insert_after(soup.new_string('\n'))
    # Bold and italic markup is flattened to plain text.
    for inline in soup.find_all(['strong', 'b', 'em', 'i']):
        inline.string = inline.get_text()

    # Strip every line and collapse consecutive blank lines into one.
    cleaned_lines = []
    for line in soup.get_text().split('\n'):
        stripped_line = line.strip()
        if stripped_line or (cleaned_lines and cleaned_lines[-1]):
            cleaned_lines.append(stripped_line)
    text = '\n'.join(cleaned_lines)

    # Restore the tables.
    for placeholder, table_markdown in table_placeholders.items():
        text = text.replace(placeholder, table_markdown)
    return text.strip()
if __name__ == "__main__":
    # Demo input: a five-column table whose body mixes a rowspan/colspan
    # block (rows 1-3, columns 1-2) with a colspan cell in the last row.
    sample_html = """
<html>
<body>
<table border="1">
<thead>
<tr>
<th>标题1</th>
<th>标题2</th>
<th>标题3</th>
<th>标题4</th>
<th>标题5</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="3" colspan="2">合并单元格1</td>
<td>内容1</td>
<td>内容2</td>
<td>内容3</td>
</tr>
<tr>
<td>内容4</td>
<td>内容5</td>
<td>内容6</td>
</tr>
<tr>
<td>内容7</td>
<td>内容8</td>
<td>内容9</td>
</tr>
<tr>
<td>行4</td>
<td>内容10</td>
<td colspan="2">合并单元格2</td>
<td>内容11</td>
</tr>
</tbody>
</table>
</body>
</html>
"""
    # Convert the sample to Markdown text and print the result.
    print(clean_html_tag_with_bs4(sample_html))
标签:rowspan,HTML,表格,text,cols,markdown,colspan,find,row
From: https://blog.csdn.net/weixin_42075141/article/details/145061561