示例为调用huggingface的大模型,使其流式输出
"""FastAPI service that streams tokens from a local HuggingFace causal LM
to the client via Server-Sent Events (SSE).

Generation runs in a background thread feeding a ``TextIteratorStreamer``;
the async SSE generator drains the streamer and forwards each text chunk.
"""

from fastapi import FastAPI, Request
import requests  # noqa: F401  (kept from original; unused here)
import json  # noqa: F401  (kept from original; used only by commented-out code)
import os
from pydantic import BaseModel
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextStreamer,  # noqa: F401  (kept from original; TextIteratorStreamer is used)
    TextIteratorStreamer,
)
from sse_starlette.sse import EventSourceResponse
from threading import Thread
import asyncio

# Silence tokenizer fork/parallelism warnings: generation runs in a thread.
# (Original set this twice via a duplicated `import os`; once is enough.)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

app = FastAPI()


class Model:
    # Process-wide holder for the model/tokenizer loaded at startup.
    model = None
    tokenizer = None


class Message(BaseModel):
    # Request schema for /chat; `messages` is the chat-template input.
    messages: str


@app.on_event("startup")
async def startup_event():
    """Load the tokenizer and model once when the server starts."""
    # Model path (local checkpoint directory).
    model_id = "/data1/songxiaoyong/model_70b"
    # model_id = "/data1/songxiaoyong/model_8b_origin"
    Model.tokenizer = AutoTokenizer.from_pretrained(model_id)
    Model.model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype="auto", device_map="auto"
    )


@app.get("/")
async def index():
    """Root health-check endpoint."""
    return {"message": "Welcome"}


@app.post("/chat")
async def chat(request: Request):
    """Stream a chat completion as SSE text chunks.

    Expects a JSON body ``{"messages": <chat messages>}`` and returns an
    ``EventSourceResponse`` yielding generated text incrementally.
    """
    model = Model.model
    tokenizer = Model.tokenizer

    payload = await request.json()
    messages = payload["messages"]
    print(messages)

    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )
    # With device_map="auto" the model may span devices; place inputs on the
    # model's primary device instead of hard-coding 'cuda'.
    encoding = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt=True stops the streamer echoing the prompt back, replacing
    # the original "drop the first chunk" hack (which could also discard the
    # first piece of real output).
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    generation_kwargs = dict(
        encoding,
        streamer=streamer,
        max_new_tokens=8192,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
    )
    # Daemon thread so a hung generation cannot block server shutdown.
    Thread(target=model.generate, kwargs=generation_kwargs, daemon=True).start()

    async def event_generator(streamer):
        for new_text in streamer:
            text = str(new_text)
            # Strip the end-of-turn marker emitted by the model.
            if "<|eot_id|>" in text:
                text = text.replace("<|eot_id|>", "")
            # SSE frames are newline-delimited, so encode newlines with the
            # sentinel the client decodes ("@@@@").
            if "\n" in text:
                text = text.replace("\n", "@@@@")
            yield text
            await asyncio.sleep(0.1)

    event_source = EventSourceResponse(event_generator(streamer))
    # Long ping interval so keep-alive pings do not interleave with tokens.
    event_source.ping_interval = 60000
    return event_source
nohup uvicorn server:app --host '0.0.0.0' --port 8000 --reload > server.logs 2>&1 &
标签:tokenizer,fastapi,流式,import,text,sse,new,model,event From: https://www.cnblogs.com/sxy-blog/p/18245267