D:\code_gitee\python_get_msedge_history\新建文件夹\cnblog2anki.py
from base64 import encode
import os
import re
import shutil
import easygui
import requests
from subprocess import run
from easygui import *
from bs4 import BeautifulSoup
def get_html_content(link, timeout=10):
    """Download a blog post page and return its title and body elements.

    :param link: URL of the page to fetch,
        e.g. 'https://www.cnblogs.com/zhuoss/p/16909370.html'.
    :param timeout: seconds to wait for the server before giving up. Without
        a timeout ``requests.get`` may block indefinitely on a stalled
        connection.
    :return: tuple ``(title, content)`` holding the first elements matching
        the CSS selectors ``.postTitle`` and ``.postText``.
    :raises IndexError: if the page has no element for one of the selectors.
    :raises requests.exceptions.RequestException: on network failure/timeout.
    """
    # Request headers: send a browser-like User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }
    res = requests.get(url=link, headers=headers, timeout=timeout).text
    soup = BeautifulSoup(res, "html.parser")
    title = soup.select('.postTitle')[0]
    content = soup.select('.postText')[0]
    return title, content
class User(EgStore):
    """Persistent settings object backed by easygui's ``EgStore``."""

    def __init__(self, filename):
        # Default value for the remembered path; assigned before the base
        # initializer runs, exactly as in the original ordering.
        self.path = ''
        super().__init__(filename)
def get_file_path():
    """Ask the user, through easygui dialogs, which files to process.

    The user first enters ``2`` to pick a directory; any other answer opens a
    multi-file picker. The chosen location is saved in ``settings.txt`` so the
    next dialog can start from it.

    :return: list of file paths. In directory mode this is every file below
        the chosen directory whose name ends in html/mhtml/htm/txt. An empty
        list is returned when the user cancels a dialog.
    """
    # Ask whether to select individual files (1) or a whole directory (2).
    file_dir_flag = easygui.enterbox(msg='file(1) or dir(2):', strip=True)
    # Create the settings store and load whatever was saved previously.
    user = User("settings.txt")
    user.restore()

    if file_dir_flag == '2':
        chosen_dir = easygui.diropenbox(default=user.path)
        if not chosen_dir:
            # Dialog cancelled: nothing to walk, and do not overwrite the
            # remembered path with None.
            return []
        user.path = chosen_dir
        user.store()
        wanted = re.compile(r"^[\s\S]*\.(html|mhtml|htm|txt)$")
        files = []
        for dirpath, _dirnames, filenames in os.walk(chosen_dir):
            for name in filenames:
                # Join with the directory currently being walked; joining with
                # the root directory produced wrong paths for nested files.
                filename = os.path.join(dirpath, name)
                if wanted.match(filename):
                    files.append(filename)
        return files

    chosen_files = easygui.fileopenbox(multiple=True, default=user.path)
    if not chosen_files:
        # Dialog cancelled.
        return []
    user.path = chosen_files[0]
    user.store()
    return chosen_files
def setDir(filepath):
    '''
    Ensure ``filepath`` is an existing, empty directory.

    If the directory does not exist it is created (including any missing
    parent directories); if it exists its contents are removed first.

    :param filepath: path of the directory to create or reset
    :return: None
    '''
    if os.path.exists(filepath):
        # Best-effort removal, as before.
        shutil.rmtree(filepath, ignore_errors=True)
    # makedirs handles missing parents, and exist_ok avoids a crash when the
    # best-effort removal above could not delete the directory itself.
    os.makedirs(filepath, exist_ok=True)
def cnblog2anki(file):
    """Extract ``(title, url)`` pairs from a saved HTML listing page.

    The first ``<tbody>`` in the document is scanned; for every ``<tr>`` the
    anchor directly inside the first ``<td>`` supplies the title (its text)
    and the URL (its ``href``).

    :param file: path of the UTF-8 encoded HTML file to parse.
    :return: list of ``(title, url)`` tuples; empty if the file contains no
        ``<tbody>``. Rows without a link in the first cell are skipped.
    """
    with open(file, "r", encoding='utf-8') as f:  # open and read the file
        soup = BeautifulSoup(f.read(), 'html.parser')

    tbody = soup.select_one("tbody")
    if tbody is None:
        return []

    res = []
    for tr_ele in tbody.select('tr'):
        anchor = tr_ele.select_one('td:nth-child(1)>a')
        if anchor is None:
            # e.g. header or spacer rows with no link in the first cell
            continue
        href = anchor.get('href')
        if not href:
            continue
        # The scheme is prepended as before, except when the href already
        # carries one (prefixing would yield "http:https://...").
        if href.startswith(('http://', 'https://')):
            url = href
        else:
            url = 'http:' + href
        res.append((anchor.text, url))
    return res
def write2txt(msg, path=None):
    """Append ``msg`` to ``<path>.csv`` (UTF-8).

    :param msg: text to append.
    :param path: base path of the output file; ``.csv`` is appended to it.
        When omitted, the module-level variable ``file`` is used, which keeps
        existing single-argument calls working.
    """
    if path is None:
        # Backward-compatible fallback to the module-level loop variable.
        path = file
    with open(path + '.csv', "a", encoding='utf-8') as f:  # open for append
        # msg is a single string, so write() is the direct call; writelines()
        # would iterate it character by character.
        f.write(msg)
if __name__ == '__main__':
    # Let the user pick the input files; treat a missing result (e.g. a
    # cancelled dialog returning None) as "no files".
    selected_files = get_file_path() or []
    # The loop variable is deliberately named `file`: write2txt() reads the
    # module-level `file` to decide which .csv to append to.
    for file in selected_files:
        # Separate name for the parsed entries, so the list being iterated is
        # not rebound inside its own loop.
        entries = cnblog2anki(file)
        for content in entries:
            print(content)
            # One tab-separated line: plain title, then the title as a link.
            write2txt(f'{content[0]}\t<a href={content[1]}>{content[0]}</a>\n')
D:\code_gitee\python_get_msedge_history\新建文件夹\docs2anki.ts
import { readFileSync, appendFile } from "fs";
import { readdirSync, statSync } from "node:fs";
import { resolve, join } from "node:path";
// Collect every file below ./content, then convert each one.
const files = getAllFilePath(resolve("./content"));
files.forEach((file) => {
  const content = readFileSync(file, "utf-8");
  console.log(file);
  // Label for the note: whatever follows the last "content\" in the path.
  const fileName = file.split("content\\").pop() ?? "";
  write2file(content, fileName);
});
/**
 * Splits `content` into sentences and appends one line per sentence to
 * `anki.txt`: the sentence wrapped in a `<div>`, a tab, then a link labelled
 * with `fileName` that points at the translation URL for that sentence.
 *
 * @param content - Full text of the source document.
 * @param fileName - Label used as the link text of every generated line.
 */
function write2file(content: string, fileName: string) {
  // Split the document into sentences; nothing to do when no split was found.
  const contents = splitContent(content);
  if (!contents) return;

  const lines = contents.map((sentence) => {
    // Encoding also filters the text (newlines, fenced code) and yields the
    // value used in the link's href.
    const contentEncoding = encodeContent(sentence);
    const url = rebuildUrl(contentEncoding);
    // Decode again to obtain the filtered, human-readable sentence.
    const contentDecoding = decodeURIComponent(contentEncoding);
    return `<div>${contentDecoding}</div>\t<a href=${url}>${fileName}</a>\n`;
  });
  if (lines.length === 0) return;

  // A single append keeps this document's sentences in order; issuing one
  // un-awaited appendFile per sentence gave no ordering guarantee.
  appendFile("anki.txt", lines.join(""), (err) => {
    if (err) {
      console.error(`Failed to append notes for "${fileName}" to anki.txt:`, err);
    }
  });
}
/**
 * Recursively lists every regular file below `filePath`.
 *
 * @param filePath - Directory to start from.
 * @returns Paths of all files found, in directory-listing order, with each
 *   sub-directory expanded in place.
 */
function getAllFilePath(filePath: string) {
  const collected: string[] = [];

  const walk = (dir: string): void => {
    // Read the directory listing and classify every entry.
    for (const entry of readdirSync(dir)) {
      const fullPath = join(dir, entry);
      const info = statSync(fullPath);
      if (info.isFile()) {
        collected.push(fullPath);
      } else if (info.isDirectory()) {
        walk(fullPath);
      }
    }
  };

  walk(filePath);
  return collected;
}
/**
 * URI-encodes `content` and filters it for use in a translation link.
 *
 * - CRLF becomes `%5F%5F` and LF becomes `%5F` (encoded underscores), so the
 *   result contains no encoded line breaks.
 * - Every fenced code block (text between two runs of three encoded
 *   backticks) is removed.
 *
 * @param content - Raw text of one sentence.
 * @returns The encoded, filtered text.
 */
function encodeContent(content: string) {
  const newContent = encodeURIComponent(content)
    .replace(/%0D%0A/g, "%5F%5F") // replace Windows line breaks
    .replace(/%0A/g, "%5F") // replace remaining line breaks
    // Non-greedy so each code block is removed on its own; the greedy form
    // also deleted the prose between the first and the last block.
    .replace(/(%60){3}.*?(%60){3}/g, "");
  return newContent;
}
/**
 * Builds the Sogou translation URL for the given keyword.
 *
 * @param content - Keyword appended verbatim after `keyword=`.
 * @returns The complete URL.
 */
function rebuildUrl(content: string) {
  const base =
    "https://fanyi.sogou.com/text?transfrom=auto&transto=zh-CHS&model=general&keyword=";
  return base + content;
}
/**
 * Splits text into sentences.
 *
 * A sentence boundary is a `.` or `!` followed by optional whitespace and an
 * uppercase ASCII letter; the cut is made right after the punctuation mark,
 * so the whitespace stays at the start of the following piece.
 *
 * @param content - Text to split.
 * @returns The pieces in order, or `undefined` when no boundary is found.
 */
function splitContent(content: string) {
  // `[.!]` — the earlier class `[\.|!]` also treated a literal `|` as a
  // sentence terminator.
  const boundaries = [...content.matchAll(/[.!]\s*[A-Z]/g)];
  // Nothing matched: report that no split is possible.
  if (boundaries.length === 0) return undefined;

  const contents: string[] = [];
  let startIndex = 0;
  for (const boundary of boundaries) {
    // +1 keeps the punctuation mark with the sentence it terminates.
    const endIndex = (boundary.index ?? 0) + 1;
    contents.push(content.slice(startIndex, endIndex));
    startIndex = endIndex;
  }
  // Whatever follows the last boundary is the final sentence.
  contents.push(content.slice(startIndex));
  return contents;
}
D:\code_gitee\python_get_msedge_history\新建文件夹\get_history.py
import os
import sqlite3
# History
class History:
    """Reader for the ``urls`` table of a browser ``History`` SQLite file."""

    def __init__(self, chromePath):
        """
        :param chromePath: directory that contains the ``History`` database.
        """
        self.chromePath = chromePath

    def connect(self):
        """Open ``<chromePath>/History`` and keep the connection on ``self.conn``."""
        self.conn = sqlite3.connect(os.path.join(self.chromePath, "History"))
        # NOTE: attribute name is misspelled ("cousor"); kept unchanged so
        # any external code reading it keeps working.
        self.cousor = self.conn.cursor()

    def close(self):
        """Close the connection opened by :meth:`connect`."""
        self.conn.close()

    def get_history(self):
        """Return every row of ``urls`` as a dict with keys id, url, title.

        :return: list of ``{'id': ..., 'url': ..., 'title': ...}`` dicts.
        """
        cursor = self.conn.execute(
            "SELECT id,url,title from urls")
        # The query yields three columns; unpacking four names per row raised
        # ValueError on the first row.
        return [
            {'id': _id, 'url': url, 'title': title}
            for _id, url, title in cursor
        ]
if __name__ == "__main__":
    # Directory holding the browser's "History" database file.
    # path = "C:\\Users\\Administrator\\AppData\\Local\\Microsoft\\Edge\\User Data\\Default"
    path = ".\\"
    history = History(path)
    history.connect()
    try:
        rows = history.get_history()
        # `with` guarantees the output file is flushed and closed.
        with open('.//history.txt', 'a', encoding='utf-8') as f:
            for row in rows:
                # A NULL title comes back as None; write an empty line for it
                # instead of failing on None + str.
                f.write((row['title'] or '') + "\n")
    finally:
        # Release the database even if reading or writing failed.
        history.close()
D:\code_gitee\python_get_msedge_history\新建文件夹\get_history.ts
import { verbose } from "sqlite3";
import { appendFileSync, write } from "fs";
/** One row selected from the `urls` table. */
interface IRow {
  /** Value of the `url` column. */
  url: string;
  /** Value of the `title` column. */
  title: string;
}
const sqlite3 = verbose();
const db = new sqlite3.Database("History.sqlite");

// One query per dictionary site, paired with the tag handleRow() expects.
const historyQueries: [string, string][] = [
  ["SELECT url,title FROM urls WHERE url LIKE '%youdao.com%' ", "youdao"],
  ["SELECT url,title FROM urls WHERE url LIKE '%fanyi.baidu.com%' ", "baidu"],
];

for (const [sql, cidian] of historyQueries) {
  db.each(sql, (err: Error | null, row: IRow) => {
    if (err) {
      // Report the failure instead of dropping it silently; the row is
      // still skipped, as before.
      console.error(`Query for "${cidian}" history failed:`, err);
      return;
    }
    handleRow(row, cidian);
  });
}

db.close((err: Error | null) => {
  if (err) console.error("Failed to close History.sqlite:", err);
});
/**
 * Turns one history row into a line of `history.txt`.
 *
 * @param row - Row whose `url` contains the looked-up text.
 * @param cidian - Dictionary tag that selects how the text is extracted.
 */
function handleRow(row: IRow, cidian: string) {
  const encodedText = getContent(row.url, cidian);
  if (!encodedText) {
    // Nothing could be extracted from this URL.
    return;
  }
  const link = rebuildUrl(encodedText);
  const text = decodeContent(encodedText);
  // Line layout: decoded text, a tab, then the same text as a link.
  write2file(`${text}\t<a href=${link}>${text}</a>\n`);
}
/**
 * Decodes a URI-component-encoded string.
 *
 * @param url - Encoded text.
 * @returns The decoded text, or `url` unchanged when it contains a malformed
 *   escape sequence.
 */
function decodeContent(url: string) {
  try {
    return decodeURIComponent(url);
  } catch (e: unknown) {
    // decodeURIComponent throws URIError on malformed input (e.g. a lone
    // "%"); fall back to the raw text so one bad entry does not abort the run.
    if (e instanceof URIError) return url;
    throw e;
  }
}
/**
 * Extracts the still-encoded looked-up text from a dictionary URL.
 *
 * - `"youdao"`: the text between `word=` and the first `&lang=en`.
 * - `"baidu"`: everything after `#en/zh/`, e.g.
 *   `https://fanyi.baidu.com/?aldtype=85#en/zh/my%20GUI%20only...`
 *
 * @param url - URL taken from the history table.
 * @param cidian - Dictionary tag: `"youdao"` or `"baidu"`.
 * @returns The extracted text, or `undefined` when the URL does not match
 *   or the tag is unknown.
 */
function getContent(url: string, cidian: string) {
  switch (cidian) {
    case "youdao": {
      // Capture the value itself; slicing a fixed 8 characters off the end
      // was only right when "&lang=en" was the very end of the URL.
      const match = /word=(.*?)&lang=en/.exec(url);
      return match ? match[1] : undefined;
    }
    case "baidu": {
      const match = /#en\/zh\/(.+)/.exec(url);
      return match ? match[1] : undefined;
    }
    default:
      return undefined;
  }
}
/**
 * Appends `content` to `history.txt` as UTF-8, blocking until it is written.
 *
 * @param content - Text to append.
 */
function write2file(content: string) {
  const target = "history.txt";
  appendFileSync(target, content, { encoding: "utf8" });
}
/**
 * Builds the Sogou translation URL for the given keyword.
 *
 * @param content - Keyword placed verbatim after `keyword=`.
 * @returns The complete URL.
 */
function rebuildUrl(content: string) {
  const prefix =
    "https://fanyi.sogou.com/text?transfrom=auto&transto=zh-CHS&model=general&keyword=";
  return [prefix, content].join("");
}
D:\code_gitee\python_get_msedge_history\新建文件夹\readFile.ts
import { readdirSync, readdir, stat, statSync } from "node:fs";
import { resolve, join } from "node:path";
// Absolute path of the folder to traverse.
const filePath = resolve("./content");
// Gather every file below it.
const files = getAllFilePath(filePath);
files.forEach((file) => {
  // Print the part of the path that follows the last "content\".
  console.log(file.split("content\\").pop());
});
/**
 * Recursively lists every regular file below `filePath`.
 *
 * @param filePath - Directory to start from.
 * @returns Paths of all files found, in directory-listing order, with each
 *   sub-directory expanded in place.
 */
function getAllFilePath(filePath: string) {
  const listFiles = (dir: string): string[] =>
    // Read the directory listing and expand each entry into 0..n file paths.
    readdirSync(dir).flatMap((name: string) => {
      const fullPath = join(dir, name);
      const info = statSync(fullPath);
      if (info.isFile()) return [fullPath];
      return info.isDirectory() ? listFiles(fullPath) : [];
    });

  return listFiles(filePath);
}
D:\code_gitee\python_get_msedge_history\新建文件夹\replaceCode.ts
// Scratch script: checks the code-fence filter on a URI-encoded Markdown
// sample that contains inline code spans and one fenced code block.
let content = `%0A%0AIf%20you%20want%20the%20node%20application%20to%20close%20after%20the%20script%20finishes%20(e.g.%2C%20for%20a%20script%20running%20CRON%20jobs)%2C%20add%20%60await%20app.close()%60%20to%20the%20end%20of%20your%20%60bootstrap%60%20function%3A%0A%0A%60%60%60typescript%0A%40%40filename()%0Aasync%20function%20bootstrap()%20%7B%0A%20%20const%20app%20%3D%20await%20NestFactory.createApplicationContext(AppModule)%3B%0A%20%20%2F%2F%20application%20logic...%0A%20%20await%20app.close()%3B%0A%7D%0Abootstrap()%3B%0A%60%60%60%0A%0A%23%23%23%23%20Example%0A%0AA%20working%20example%20is%20available%20%5Bhere%5D(https%3A%2F%2Fgithub.com%2Fnestjs%2Fnest%2Ftree%2Fmaster%2Fsample%2F18-context).%0A`;
// Remove everything from the first run of three encoded backticks (%60) to
// the last one. NOTE(review): `.*` is greedy, so with more than one fenced
// block the text between the blocks would be removed as well.
let newCon = content.replace(/(%60){3}.*(%60){3}/g, "");
// Print the filtered text, first still encoded, then decoded.
console.log(newCon);
console.log(decodeURIComponent(newCon));
标签:const,string,chrome,docs,return,content,file,anki,path
From: https://www.cnblogs.com/zhuoss/p/17052762.html