首页 > 其他分享 >chrome_history_and_docs_2_anki

chrome_history_and_docs_2_anki

时间:2023-01-14 23:11:31浏览次数:37  
标签:const string chrome docs return content file anki path

D:\code_gitee\python_get_msedge_history\新建文件夹\cnblog2anki.py

from base64 import encode
import os
import re
import shutil
import easygui
import requests
from subprocess import run
from easygui import *
from bs4 import BeautifulSoup


def get_html_content(link, timeout=10):
    """Download a blog post and return its title and body elements.

    :param link: URL of the post, e.g.
        ``https://www.cnblogs.com/zhuoss/p/16909370.html``.
    :param timeout: seconds to wait for the server. Without a timeout
        ``requests.get`` may block indefinitely on a stalled connection.
    :return: ``(title, content)`` -- the first element matching
        ``.postTitle`` and the first element matching ``.postText``.
    :raises IndexError: if the page contains no ``.postTitle`` or no
        ``.postText`` element.
    :raises requests.exceptions.RequestException: on network failure or
        when the timeout expires.
    """
    # Request headers: send a desktop-browser User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }

    res = requests.get(url=link, headers=headers, timeout=timeout).text
    soup = BeautifulSoup(res, "html.parser")
    title = soup.select('.postTitle')[0]
    content = soup.select('.postText')[0]
    return title, content


class User(EgStore):
    """Settings record persisted through easygui's ``EgStore``.

    ``path`` is the only stored attribute; it defaults to an empty string.
    """

    def __init__(self, filename):
        # Declare the attribute (and its default) before the base
        # initializer runs, exactly as the store expects it to exist.
        self.path = ''
        super().__init__(filename)


def get_file_path():
    """Ask the user, through easygui dialogs, which files to process.

    The user first answers ``file(1) or dir(2)``. Answering ``2`` opens a
    directory picker and the directory is searched recursively for files
    ending in ``.html``, ``.mhtml``, ``.htm`` or ``.txt``. Any other answer
    opens a multi-file picker. The chosen location is saved in
    ``settings.txt`` and offered as the default of the next dialog.

    :return: list of file paths; an empty list if the picker was cancelled.
    """
    # Ask through the GUI whether a single folder or individual files
    # should be selected.
    file_dir_flag = easygui.enterbox(msg='file(1) or dir(2):', strip=True)

    # Create the settings store and load what was saved previously.
    user = User("settings.txt")
    user.restore()

    if file_dir_flag == '2':
        chosen_dir = easygui.diropenbox(default=user.path)
        if not chosen_dir:
            # Dialog cancelled: nothing to process, and do not overwrite
            # the remembered path with None.
            return []
        user.path = chosen_dir
        user.store()

        files = []
        for dirpath, _dirnames, filenames in os.walk(chosen_dir):
            for name in filenames:
                # Join with the directory currently being walked; joining
                # with the root would yield wrong paths for nested files.
                filename = os.path.join(dirpath, name)
                if re.match(r"^[\s\S]*\.(html|mhtml|htm|txt)$", filename):
                    files.append(filename)
        return files

    chosen_files = easygui.fileopenbox(multiple=True, default=user.path)
    if not chosen_files:
        # Dialog cancelled.
        return []
    user.path = chosen_files[0]
    user.store()
    return chosen_files


def setDir(filepath):
    '''
    Make sure ``filepath`` is an empty directory.

    Create the folder when it is missing; when it already exists, remove it
    (errors during removal are ignored) and create it again.

    :param filepath: path of the folder to create
    :return: None
    '''
    already_there = os.path.exists(filepath)
    if already_there:
        # Wipe the previous contents before recreating the folder.
        shutil.rmtree(filepath, ignore_errors=True)
    os.mkdir(filepath)


def cnblog2anki(file):
    """Extract ``(title, url)`` pairs from a saved HTML listing.

    Only the first ``<tbody>`` of the document is inspected. For every
    ``<tr>`` the link in the first cell (``td:nth-child(1)>a``) supplies the
    title (its text) and the URL (``'http:'`` + its ``href``).

    :param file: path of a UTF-8 encoded HTML file.
    :return: list of ``(title, url)`` tuples in document order. Rows without
        a link (or whose link has no ``href``) are skipped; a document
        without ``<tbody>`` yields an empty list.
    """
    with open(file, "r", encoding='utf-8') as f:  # open and read the file
        soup = BeautifulSoup(f.read(), 'html.parser')

    bodies = soup.select("tbody")
    if not bodies:
        return []

    res = []
    for tr_ele in bodies[0].select('tr'):
        links = tr_ele.select('td:nth-child(1)>a')
        if not links:
            # e.g. a header or spacer row without a link in the first cell
            continue
        href = links[0].get('href')
        if href is None:
            continue
        # NOTE(review): prefixing 'http:' presumes hrefs are
        # protocol-relative ("//host/..."); confirm against the input files.
        res.append((links[0].text, 'http:' + href))
    return res


def write2txt(msg, path=None):
    """Append ``msg`` to ``<path>.csv`` (UTF-8).

    :param msg: text to append; a list of strings is also accepted and is
        written item by item.
    :param path: base path of the output file (``'.csv'`` is appended).
        When omitted, the module-level variable ``file`` -- the loop
        variable of the ``__main__`` section -- is used, which is what the
        function always relied on implicitly.
    :raises NameError: if ``path`` is omitted and no module-level ``file``
        exists.
    """
    if path is None:
        path = file  # module-level loop variable set by the __main__ block
    with open(path + '.csv', "a", encoding='utf-8') as f:  # open for append
        if isinstance(msg, str):
            f.write(msg)
        else:
            f.writelines(msg)


if __name__ == '__main__':
    # The loop variable has to be named `file`: write2txt() reads that
    # module-level name to decide which "<file>.csv" it appends to.
    for file in get_file_path():
        for title, url in cnblog2anki(file):
            print((title, url))
            # One tab-separated line: the title, then a link to the post.
            write2txt(f'{title}\t<a href={url}>{title}</a>\n')

D:\code_gitee\python_get_msedge_history\新建文件夹\docs2anki.ts

import { readFileSync, appendFile } from "fs";
import { readdirSync, statSync } from "node:fs";
import { resolve, join } from "node:path";

// Collect every file below ./content (recursively).
const files = getAllFilePath(resolve("./content"));

for (const file of files) {
  const content = readFileSync(file, {
    encoding: "utf-8",
  });
  console.log(file);
  // Label each card with the path relative to the content folder. Both
  // "\" and "/" are accepted so the label is also relative on non-Windows
  // systems, where splitting on "content\\" alone never matches.
  write2file(content, file.split(/content[\\/]/).pop() ?? "");
}

/**
 * Split `content` into sentences and append one tab-separated line per
 * sentence to "anki.txt": the sentence wrapped in a <div>, then a link to the
 * Sogou translation of that sentence labelled with `fileName`.
 *
 * Nothing is written when splitContent() finds no sentence boundary.
 *
 * All lines of one call are written with a single appendFile(), so they stay
 * in sentence order, and a write failure is reported instead of being ignored.
 */
function write2file(content: string, fileName: string) {
  // Sentences of the document.
  const contents = splitContent(content);
  if (!contents) return;

  const lines: string[] = [];
  for (const sentence of contents) {
    // Encoding also filters the text (line breaks, fenced code); the encoded
    // form is what goes into the link's href.
    const contentEncoding = encodeContent(sentence);
    const url = rebuildUrl(contentEncoding);
    // Decode the filtered text again for display.
    const contentDecoding = decodeURIComponent(contentEncoding);
    lines.push(
      `<div>${contentDecoding}</div>\t<a href=${url}>${fileName}</a>\n`
    );
  }

  appendFile("anki.txt", lines.join(""), (err) => {
    if (err) console.error(`Failed to append to anki.txt: ${err.message}`);
  });
}

/**
 * Return the paths of all regular files below `filePath`, descending into
 * sub-directories as they are encountered (directory-listing order).
 */
function getAllFilePath(filePath: string) {
  const collected: string[] = [];

  const walk = (dir: string): void => {
    // List the directory, then classify each entry.
    for (const entry of readdirSync(dir)) {
      const fullPath = join(dir, entry);
      const info = statSync(fullPath);
      if (info.isFile()) {
        collected.push(fullPath);
      } else if (info.isDirectory()) {
        walk(fullPath);
      }
    }
  };

  walk(filePath);
  return collected;
}

/**
 * Percent-encode `content` and filter it for use in a URL:
 *  - CRLF becomes "__" and LF becomes "_" (encoded as %5F);
 *  - every span fenced by triple backticks is removed.
 *
 * The fence pattern is non-greedy so that each fenced span is removed on its
 * own; a greedy match would also delete all text between the first and the
 * last fence when the input holds more than one code block.
 */
function encodeContent(content: string) {
  const newContent = encodeURIComponent(content)
    .replace(/%0D%0A/g, "%5F%5F") // replace CRLF line breaks
    .replace(/%0A/g, "%5F") // replace LF line breaks
    .replace(/(%60){3}.*?(%60){3}/g, ""); // strip fenced code blocks
  return newContent;
}

/** Build the Sogou translate URL whose `keyword` parameter is `content`. */
function rebuildUrl(content: string) {
  const prefix =
    "https://fanyi.sogou.com/text?transfrom=auto&transto=zh-CHS&model=general&keyword=";
  return prefix + content;
}

/**
 * Split `content` into sentences.
 *
 * A boundary is a "." or "!" followed by optional whitespace and an
 * upper-case ASCII letter; the cut is made right after the punctuation mark,
 * so each following piece starts with the whitespace that separated it.
 *
 * Returns undefined when no boundary is found, otherwise the pieces in order
 * (the last piece is whatever follows the final boundary).
 *
 * The character class is `[.!]`: inside a class "|" is a literal character,
 * so the former `[\.|!]` also split after every "|" followed by a capital.
 */
function splitContent(content: string) {
  // Cut positions: one past each sentence-ending punctuation mark.
  const cuts = Array.from(
    content.matchAll(/[.!]\s*[A-Z]/g),
    (m) => (m.index ?? 0) + 1
  );
  // No boundary found: signal it to the caller.
  if (cuts.length === 0) return;

  const contents: string[] = [];
  let startIndex = 0;
  for (const endIndex of cuts) {
    contents.push(content.slice(startIndex, endIndex));
    startIndex = endIndex;
  }
  // Remainder after the last boundary.
  contents.push(content.slice(startIndex));
  return contents;
}

D:\code_gitee\python_get_msedge_history\新建文件夹\get_history.py

import os
import sqlite3


# History
class History:
    """Reader for the ``urls`` table of a browser ``History`` SQLite file.

    Usage: ``connect()``, then ``get_history()``, then ``close()``.
    """

    def __init__(self, chromePath):
        """
        :param chromePath: directory that contains the file named ``History``.
        """
        self.chromePath = chromePath

    def connect(self):
        """Open ``<chromePath>/History`` and keep the connection on ``self``."""
        self.conn = sqlite3.connect(os.path.join(self.chromePath, "History"))
        # Attribute name (sic) kept unchanged for backward compatibility.
        self.cousor = self.conn.cursor()

    def close(self):
        """Close the connection opened by :meth:`connect`."""
        self.conn.close()

    def get_history(self):
        """Return every row of ``urls`` as a dict.

        :return: list of ``{'id': ..., 'url': ..., 'title': ...}`` dicts.
        """
        cursor = self.conn.execute(
            "SELECT id,url,title  from urls")
        # The query yields exactly three columns; unpacking a fourth value
        # here raised ValueError on the first row.
        return [
            {'id': _id, 'url': url, 'title': title}
            for _id, url, title in cursor
        ]


if __name__ == "__main__":
    # Directory holding the browser's "History" file, for example
    #   C:\Users\Administrator\AppData\Local\Microsoft\Edge\User Data\Default
    # Here a copy in the current directory is read.
    path = ".\\"
    history = History(path)
    history.connect()
    try:
        rows = history.get_history()
    finally:
        # Release the database even if the query fails.
        history.close()

    # Append one title per line; the context manager guarantees the file is
    # flushed and closed.
    with open('.//history.txt', 'a', encoding='utf-8') as f:
        for row in rows:
            # A NULL title is written as an empty line instead of crashing.
            f.write((row['title'] or '')+"\n")

D:\code_gitee\python_get_msedge_history\新建文件夹\get_history.ts

import { verbose } from "sqlite3";
import { appendFileSync, write } from "fs";

// Shape of one result row of the "SELECT url,title FROM urls ..." queries below.
type IRow = {
  // Value of the `url` column.
  url: string;
  // Value of the `title` column.
  title: string;
};

const sqlite3 = verbose();
const db = new sqlite3.Database("History.sqlite");

// One query per dictionary site; `cidian` tells handleRow()/getContent() how
// to pull the looked-up text out of the matching URLs.
const sources = [
  {
    sql: "SELECT url,title FROM urls WHERE url LIKE '%youdao.com%' ",
    cidian: "youdao",
  },
  {
    sql: "SELECT url,title FROM urls WHERE url LIKE '%fanyi.baidu.com%' ",
    cidian: "baidu",
  },
];

for (const { sql, cidian } of sources) {
  db.each(sql, (err: Error | null, row: IRow) => {
    if (err) {
      // Report the failure instead of dropping it silently.
      console.error(`Query for ${cidian} failed: ${err.message}`);
      return;
    }
    handleRow(row, cidian);
  });
}

// NOTE(review): close() is issued right after the queries are queued; this
// relies on sqlite3 finishing pending statements before closing -- confirm.
db.close();
/**
 * Turn one history row into a line of "history.txt": the decoded looked-up
 * text, a tab, and a link to the Sogou translation of that text.
 *
 * Rows whose URL holds no extractable text are skipped. Rows whose text is
 * not valid percent-encoding are skipped with a message, so a single odd URL
 * does not abort the whole export.
 */
function handleRow(row: IRow, cidian: string) {
  const rawContent = getContent(row.url, cidian);
  if (!rawContent) return;
  const newUrl = rebuildUrl(rawContent);

  let content: string;
  try {
    content = decodeContent(rawContent);
  } catch (err) {
    if (err instanceof URIError) {
      console.error(`Skipping undecodable URL: ${row.url}`);
      return;
    }
    throw err;
  }

  write2file(content + `\t<a href=${newUrl}>${content}</a>\n`);
}

/** Decode a percent-encoded string; throws URIError on malformed input. */
function decodeContent(url: string) {
  const decoded = decodeURIComponent(url);
  return decoded;
}

/**
 * Extract the (still percent-encoded) looked-up text from a dictionary URL.
 *
 *  - "youdao": the text between "word=" and the following "&lang=en".
 *  - "baidu":  everything after "#en/zh/", e.g.
 *    https://fanyi.baidu.com/?aldtype=85#en/zh/my%20GUI%20only%20shows...
 *
 * Returns undefined for any other `cidian` or when the URL does not match.
 *
 * The youdao text is captured with a group rather than by cutting a fixed
 * eight characters off the end, which was only right when the URL ended
 * exactly with "&lang=en".
 */
function getContent(url: string, cidian: string) {
  switch (cidian) {
    case "youdao": {
      const match = /word=(.+?)&lang=en/.exec(url);
      return match ? match[1] : undefined;
    }
    case "baidu": {
      const startPos = url.search(/#en\/zh\/.+/);
      if (startPos === -1) return;
      return url.slice(startPos + "#en/zh/".length);
    }
    default:
      return;
  }
}

/** Append `content` to "history.txt" (UTF-8), blocking until written. */
function write2file(content: string) {
  const target = "history.txt";
  appendFileSync(target, content, { encoding: "utf8" });
}

/** Build the Sogou translate URL that looks up `content`. */
function rebuildUrl(content: string) {
  return [
    "https://fanyi.sogou.com/text?transfrom=auto&transto=zh-CHS&model=general&keyword=",
    content,
  ].join("");
}

D:\code_gitee\python_get_msedge_history\新建文件夹\readFile.ts

import { readdirSync, readdir, stat, statSync } from "node:fs";
import { resolve, join } from "node:path";

// Folder to scan.
const filePath = resolve("./content");

// Collect every file below it (recursively).
const files = getAllFilePath(filePath);

for (const file of files) {
  // Print the path relative to the content folder. Both "\" and "/" are
  // accepted so this also works where the path separator is not "\".
  console.log(file.split(/content[\\/]/).pop());
}

/**
 * List all regular files under `filePath`, recursing into each
 * sub-directory at the point where it appears in the directory listing.
 */
function getAllFilePath(filePath: string): string[] {
  return readdirSync(filePath).flatMap((fileName: string) => {
    const filedir = join(filePath, fileName);
    const stats = statSync(filedir);
    if (stats.isFile()) return [filedir];
    if (stats.isDirectory()) return getAllFilePath(filedir);
    // Neither a file nor a directory: contributes nothing.
    return [];
  });
}

D:\code_gitee\python_get_msedge_history\新建文件夹\replaceCode.ts

// Percent-encoded markdown sample that contains one fenced code block.
const content = `%0A%0AIf%20you%20want%20the%20node%20application%20to%20close%20after%20the%20script%20finishes%20(e.g.%2C%20for%20a%20script%20running%20CRON%20jobs)%2C%20add%20%60await%20app.close()%60%20to%20the%20end%20of%20your%20%60bootstrap%60%20function%3A%0A%0A%60%60%60typescript%0A%40%40filename()%0Aasync%20function%20bootstrap()%20%7B%0A%20%20const%20app%20%3D%20await%20NestFactory.createApplicationContext(AppModule)%3B%0A%20%20%2F%2F%20application%20logic...%0A%20%20await%20app.close()%3B%0A%7D%0Abootstrap()%3B%0A%60%60%60%0A%0A%23%23%23%23%20Example%0A%0AA%20working%20example%20is%20available%20%5Bhere%5D(https%3A%2F%2Fgithub.com%2Fnestjs%2Fnest%2Ftree%2Fmaster%2Fsample%2F18-context).%0A`;

// Remove everything from the first encoded ``` fence to the last one.
const newCon = content.replace(/(%60){3}.*(%60){3}/g, "");

// Show the result in encoded form, then decoded.
console.log(newCon);

console.log(decodeURIComponent(newCon));

标签:const,string,chrome,docs,return,content,file,anki,path
From: https://www.cnblogs.com/zhuoss/p/17052762.html

相关文章

  • .Net6在Docker环境下操作Selenium.Chrome的那些坑
    .Net6中想实现对某个网址截屏,可通过Selenium模拟访问网址并实现截图。实现安装Nuget包<PackageReferenceInclude="Selenium.Chrome.WebDriver"Version="85.0.0"/><......
  • el-button在chrome低版本(<88)中显示异常的问题
    1、问题内容:el-button按钮显示灰色背景色和边框。【浏览器版本:chrome75】 2、问题原因:el-button使用的:not(xxx,xxx,xxx),以逗号分隔的选择器列表作为参数是实验性的,尚......
  • chrome-headless
    window命令使用找到chrome安装的位置,默认在C:\ProgramFiles\Google\Chrome\Application然后无头启动./chrome.exe--headless--remote-debugging-port=9222https......
  • chrome 监听touch类事件报错:无法被动侦听事件preventDefault
    解决方法://判断默认行为是否可以被禁用if(evt.cancelable){//判断默认行为是否已经被禁用if(!evt.defaultPrevented){evt.preventDe......
  • linux一次安装chromedrive记录
    先查看已安装的chrome版本[root@iZ8vbeixmmd1ntxae9oe19Z~]#google-chrome--versionGoogleChrome109.0.5414.74[root@iZ8vbeixmmd1ntxae9oe19Z~]#没有安装需......
  • nn.MarginRankingLoss介绍
    nn.MarginRankingLoss复现论文代码中,它使用了MarginRankingLoss()函数,以下是我百度的内容:排序损失函数对于包含\(\mathbf{N}\)个样本的batch数据\(D(x_1,x_2,y)\),\(x......
  • 在Chrome中安装扩展程序
    场景:在Chrome中安装NetBeansConnector插件,将下载好的crx文件拖到扩展程序页面时,发现该插件并没有安装成功。分析:浏览器默认禁用了拖入安装.crx扩展的功能,导致crx文件......
  • MacOS无法打开“chromedriver”,因为无法验证开发者
    最近在学习webUI自动化,需要使用谷歌浏览器的驱动,下载后加入环境变量运行chromedriver报错:无法打开“chromedriver”,因为无法验证开发者解决方案:打开终端导航至chromedri......
  • chrome究极暗夜操作
    有时候由黑色背景切换到白色背景会莫名的刺眼,尤其是晚上的时候,这种场景主要体现在由编辑器到百度(面向cv编程)所以拥有一个全黑的浏览器,应该是非常炫酷的,之前尝试过不同......
  • readthedocs | 为工具撰写使用文档
     之前有教程:bookdown-撰写和发表自己的网络书籍/文档 现在发现readthedocs可能更简单,更适合页数不多的文档,可以适配JupyternotebooksinSphinx,分析做好,注释好,文档......