10.5 Exploring the OpenAI API and LangChain
Next, we use the OpenAI API together with LangChain to summarize the parsed filings and extract the valuable information they contain. This helps us better understand the content of each document, including the business overview, risk factors, and the analysis of financial condition, while producing a more concise summary of that information.
10.5.1 The OpenAI Interface
We write the file openai_interface.py, which implements a number of functions related to the OpenAI interface: building messages, counting tokens, calling the OpenAI models, and creating summaries. It also provides a check on the number of input tokens. With these functions we can extract and summarize the key sections of a filing, making the document easier to understand and analyze.
import os
import re
from configparser import ConfigParser

import openai
import tiktoken
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.callbacks import get_openai_callback

# UnstructuredStringLoader and split_doc_in_chunks are helper utilities defined
# elsewhere in this project. The code below uses the legacy (pre-1.0) openai API.

# Read the OpenAI API key from the local credentials file
parser = ConfigParser()
_ = parser.read(os.path.join("credentials.cfg"))
openai.api_key = parser.get("open_ai", "api_key")
INITIAL_CONTEXT_MESSAGE = {
    "role": "system",
    "content": "Act as an assistant for security analysis. Your goal is to help make sense of "
               "financial information available for US public companies on EDGAR."
}

# Context window size (in tokens) of each supported model
MODEL_MAX_TOKENS = {
    "gpt-3.5-turbo": 4097,
    "gpt-3.5-turbo-16k": 16384,
}
def get_completion(messages, model="gpt-3.5-turbo"):
    return openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,  # this is the degree of randomness of the model's output
    )
def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in ["gpt-3.5-turbo", "gpt-3.5-turbo-16k"]:
        num_tokens = 0
        for message in messages:
            num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":  # if there's a name, the role is omitted
                    num_tokens += -1  # role is always required and always 1 token
        num_tokens += 2  # every reply is primed with <im_start>assistant
        return num_tokens
def compute_cost(tokens, model="gpt-3.5-turbo"):
    # Approximate cost in USD, using a per-1K-token rate for each model
    if model == "gpt-3.5-turbo":
        return round(tokens / 1000 * 0.002, 4)
    if model == "gpt-3.5-turbo-16k":
        return round(tokens / 1000 * 0.004, 4)
def get_text_tokens(value, model="gpt-3.5-turbo"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(value))
def get_messages(company_name, ticker, exchange, form, filing_date, section_title, section_text):
    prompt = f"I will give you some information about the company, the form I am analysing and " \
             f"then a text section of that form. All of this delimited by ^^^. " \
             f"Summarize the section keeping it as short as possible, without leaving out " \
             f"any information that could be relevant to an investor in the company. " \
             f"If there is any reference to debt issuance write the interest rate, if present. " \
             f"Organize the output in a list of short information points (around 20 words each). " \
             f"Remove all the points that contain duplicate information. " \
             f"Do not refer to exhibits. " \
             f"Format the output as a json with a single key 'data' and value as a list of the information points. " \
             f"^^^ " \
             f"Company Name: {company_name} " \
             f"Ticker: {ticker} " \
             f"Exchange: {exchange} " \
             f"Form: {form} " \
             f"Filing date: {filing_date} " \
             f"Section title: {section_title} " \
             f"Section text: {section_text} " \
             f"^^^"
    messages = [
        INITIAL_CONTEXT_MESSAGE,
        {"role": "user", "content": prompt},
    ]
    return messages
def create_summary(section_text, model, chain_type="map_reduce", verbose=False):
    llm = ChatOpenAI(model_name=model, openai_api_key=parser.get("open_ai", "api_key"))
    # Wrap the raw section text in a document loader, then split it into chunks that
    # fit the model's context window (both helpers are defined elsewhere in the project)
    string_loader = UnstructuredStringLoader(section_text)
    docs = split_doc_in_chunks(string_loader.load())
    chain = load_summarize_chain(llm, chain_type=chain_type, verbose=verbose)
    # The callback records how many tokens the summarization chain consumed
    with get_openai_callback() as cb:
        res = chain.run(docs)
    return res, cb.total_tokens
def summarize_section(section_text, model="gpt-3.5-turbo", chain_type="map_reduce", verbose=False):
    summary, tokens = create_summary(section_text, model, chain_type, verbose)
    # Split the summary into bullet points at sentence boundaries,
    # ignoring periods that belong to "inc." / "Inc."
    bullets = [x.strip() for x in re.split(r'(?<!inc)(?<!Inc)\. ', summary)]
    cost = compute_cost(tokens, model=model)
    return bullets, cost
def check_input_tokens(input_tokens, model):
    # True if the input would leave less than ~500 tokens of headroom for the model's reply
    return input_tokens > MODEL_MAX_TOKENS[model] - 500
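As a minimal sketch of how these functions fit together (the company metadata and section text below are placeholder values, not output from the earlier parsing steps), one might first check the token budget of the prompt and fall back to the LangChain map-reduce summary only when a section is too long for a single call:

# Hypothetical usage example; all metadata values are placeholders.
section_text = "..."  # text of one parsed filing section
messages = get_messages("Example Corp", "EXMP", "NASDAQ", "10-K",
                        "2023-06-30", "Risk Factors", section_text)

model = "gpt-3.5-turbo"
if not check_input_tokens(num_tokens_from_messages(messages, model=model), model):
    # The section fits in the context window: one direct ChatCompletion call
    response = get_completion(messages, model=model)
    print(response["choices"][0]["message"]["content"])
    print("cost:", compute_cost(response["usage"]["total_tokens"], model=model))
else:
    # Too long for a single call: fall back to the LangChain map-reduce summary
    bullets, cost = summarize_section(section_text, model=model)
    print(bullets, "cost:", cost)

In both branches the token usage is converted into a dollar estimate with compute_cost, so the cost of processing each filing can be tracked.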