"""One-off migration: move ``ocr_content`` text from MongoDB into TOS.

For every document in the ``analyzed_books`` collection of the ``spider``
database, the script uploads the ``ocr_content`` string to the TOS bucket as
``books/<_id>.md`` and then replaces the field with the object's URL.

Configuration is read from the environment so that no credentials live in
the source file:

* ``MONGO_URI``      -- MongoDB connection string
  (defaults to ``mongodb://localhost:27017``).
* ``TOS_ACCESS_KEY`` -- TOS access key (required).
* ``TOS_SECRET_KEY`` -- TOS secret key (required).

NOTE(review): an access/secret key pair used to be hard-coded in this file.
It should be treated as leaked and rotated.
"""
import logging
import os
from io import BytesIO
from urllib.parse import quote_plus  # noqa: F401  (kept: existing file-level import)

import pymongo
import tos

# Log format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# MongoDB settings.
db_name = "spider"
collection_name = "analyzed_books"

# TOS settings.
endpoint = "tos-cn-beijing.volces.com"
region = "cn-beijing"
bucket_name = "livein-origin-data"

# Every URL written back to MongoDB starts with this prefix; it doubles as
# the marker for "this document has already been migrated".
url_prefix = f"https://{bucket_name}.{endpoint}/"


def _require_env(name):
    """Return the value of environment variable *name* or exit with a message."""
    value = os.environ.get(name)
    if not value:
        raise SystemExit(f"missing required environment variable: {name}")
    return value


def _migrate_book(collection, tos_client, doc):
    """Upload one document's OCR text to TOS and store the URL in MongoDB.

    Args:
        collection: the pymongo collection that holds *doc*.
        tos_client: a ``tos.TosClientV2`` instance used for the upload.
        doc: mapping with ``_id`` and (normally) ``ocr_content``.

    Returns:
        ``True`` if the document was uploaded and updated, ``False`` if it
        was skipped (field missing / not a string, or already a TOS URL).

    Raises:
        Whatever ``put_object`` or ``update_one`` raise; the caller decides
        how to handle a per-document failure.
    """
    book_id = doc['_id']
    ocr_content = doc.get('ocr_content')

    if not isinstance(ocr_content, str):
        logging.warning("跳过 (ocr_content 缺失或不是字符串): _id=%s", book_id)
        return False

    # Re-running the script must not upload the URL itself over the real
    # content that is already stored in TOS.
    if ocr_content.startswith(url_prefix):
        logging.info("跳过 (已迁移): _id=%s", book_id)
        return False

    object_key = f"books/{book_id}.md"
    # Wrap the encoded text in a file-like object for the upload.
    payload = BytesIO(ocr_content.encode('utf-8'))
    tos_client.put_object(bucket_name, object_key, content=payload)

    # Only rewrite the MongoDB field after the upload succeeded, so a failed
    # upload never loses the text.
    # NOTE(review): whether this URL is publicly readable depends on the
    # bucket's access policy -- confirm.
    tos_url = f"{url_prefix}{object_key}"
    collection.update_one({'_id': book_id}, {'$set': {'ocr_content': tos_url}})
    logging.info(f"更新后的数据: {tos_url}\n")
    return True


def main():
    """Run the migration over the whole collection and log a summary."""
    mongo_uri = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
    ak = _require_env("TOS_ACCESS_KEY")
    sk = _require_env("TOS_SECRET_KEY")

    # One TOS client for the whole run instead of one per document.
    tos_client = tos.TosClientV2(ak, sk, endpoint, region)

    migrated = skipped = failed = 0
    with pymongo.MongoClient(mongo_uri) as mongo_client:
        collection = mongo_client[db_name][collection_name]

        total = collection.count_documents({})
        print(f'analyzed_books需要修改的数据数量:{total}')

        # Stream the cursor and fetch only the field we need rather than
        # materialising every full document in memory. If the cursor hands
        # back a document a second time after its update, the
        # "already migrated" guard in _migrate_book makes that a no-op.
        for doc in collection.find({}, {'ocr_content': 1}):
            try:
                if _migrate_book(collection, tos_client, doc):
                    migrated += 1
                else:
                    skipped += 1
            except Exception as e:
                # Per-document boundary: record the failure (with traceback
                # and id) and carry on with the remaining documents.
                failed += 1
                logging.exception("更新失败 (_id=%s): %s", doc.get('_id'), e)

    logging.info("完成: 成功 %d, 跳过 %d, 失败 %d", migrated, skipped, failed)


if __name__ == "__main__":
    main()
标签:某字,tos,替换成,content,TOS,import,ocr,name From: https://www.cnblogs.com/wolvies/p/18674550