10.5.2 Processing with LangChain and OpenAI
The file summarizer.py defines helpers for processing text and documents with LangChain and OpenAI. The class UnstructuredStringLoader loads unstructured string content through the `unstructured` partitioning pipeline so it can be wrapped as LangChain Documents. The functions split_text_in_chunks and split_doc_in_chunks split raw text and Document objects, respectively, into appropriately sized chunks. Finally, the doc_summary function prints basic statistics about a list of documents along with a short preview.
from typing import Any, List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.unstructured import UnstructuredBaseLoader


class UnstructuredStringLoader(UnstructuredBaseLoader):
    """Load a raw in-memory string via the `unstructured` partitioning pipeline."""

    def __init__(
        self, content: str, source: Optional[str] = None, mode: str = "single",
        **unstructured_kwargs: Any
    ):
        self.content = content
        self.source = source
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        # Partition the in-memory string into unstructured elements
        from unstructured.partition.text import partition_text
        return partition_text(text=self.content, **self.unstructured_kwargs)

    def _get_metadata(self) -> dict:
        # Attach the source label to each document's metadata, if one was given
        return {"source": self.source} if self.source else {}
def split_text_in_chunks(text, chunk_size=20000):
    # Split a raw string into overlapping chunks of at most chunk_size characters
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    return chunks


def split_doc_in_chunks(doc, chunk_size=20000):
    # Split a list of Document objects into chunks, preserving their metadata
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=100)
    chunks = text_splitter.split_documents(doc)
    return chunks
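Either splitter can then be applied to the loader's output. A brief sketch, continuing the hypothetical docs from the previous example (the smaller chunk_size is only for demonstration):

# Split the loaded documents into chunks a summarization chain can handle
chunks = split_doc_in_chunks(docs, chunk_size=2000)  # demo-sized chunks
print(f'{len(chunks)} chunk(s)')                     # each chunk is a Document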
def doc_summary(docs):
    # Print basic statistics and a short preview of the loaded documents
    print(f'You have {len(docs)} document(s)')
    num_words = sum([len(doc.page_content.split(' ')) for doc in docs])
    print(f'You have roughly {num_words} words in your docs')
    print()
    print(f'Preview: \n{docs[0].page_content.split(". ")[0]}')
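The OpenAI side of the title typically enters through a summarization chain. The following end-to-end sketch is an assumption about how these helpers would be combined, not code from summarizer.py itself; it uses the classic langchain APIs matching the imports above, requires an OPENAI_API_KEY in the environment, and the long_text variable is hypothetical:

# Hypothetical end-to-end sketch: load, inspect, split, then summarize with OpenAI
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain

docs = UnstructuredStringLoader(long_text, source="report").load()  # long_text: your input
doc_summary(docs)                    # print document count, word total, and a preview
chunks = split_doc_in_chunks(docs)   # keep each piece under the model's context window
chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce")
print(chain.run(chunks))             # the generated summary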
From: https://blog.csdn.net/asd343442/article/details/139755047