# wizardcoder-ggml / main.py
import fastapi
import json
import uvicorn
from fastapi import HTTPException
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
from starlette.responses import StreamingResponse
from ctransformers import AutoModelForCausalLM
from pydantic import BaseModel
from typing import List, Dict, Any, Generator

# Load the quantized WizardCoder GGML weights with ctransformers (StarCoder architecture).
llm = AutoModelForCausalLM.from_pretrained("TheBloke/WizardCoder-15B-1.0-GGML",
                                           model_file="WizardCoder-15B-1.0.ggmlv3.q4_0.bin",
                                           model_type="starcoder")

app = fastapi.FastAPI(title="🪄WizardCoder💫")

# Allow cross-origin requests so external frontends (e.g. the embedded editor below) can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def index():
    # Landing page: links to the model card and embeds the hosted editor frontend.
    html_content = """
    <html>
    <head>
    </head>
    <body style="background-color:black">
    <h2 style="font-family:system-ui"><a href="https://huggingface.co./TheBloke/WizardCoder-15B-1.0-GGML">wizardcoder-ggml</a></h2>
    <iframe
        src="https://matthoffner-monacopilot.hf.space"
        frameborder="0"
        width="95%"
        height="90%"
    ></iframe>
    <h2 style="font-family:system-ui"><a href="https://matthoffner-wizardcoder-ggml.hf.space/docs">FastAPI Docs</a></h2>
    </body>
    </html>
    """
    return HTMLResponse(content=html_content, status_code=200)

class ChatCompletionRequestV0(BaseModel):
    prompt: str


class Message(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    messages: List[Message]
    max_tokens: int = 250
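
# Example request body for /v1/chat/completions (illustrative values only):
#   {
#     "messages": [
#       {"role": "user", "content": "Write a Python function that reverses a string."}
#     ],
#     "max_tokens": 250
#   }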

@app.post("/v1/completions")
async def completion(request: ChatCompletionRequestV0, response_mode=None):
    # Single-shot completion: run the model on the raw prompt and return the generated text.
    response = llm(request.prompt)
    return response

@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequest):
    # Concatenate all message contents into a single prompt and tokenize it.
    combined_messages = ' '.join([message.content for message in request.messages])
    tokens = llm.tokenize(combined_messages)

    try:
        chat_chunks = llm.generate(tokens)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    async def format_response(chat_chunks: Generator) -> Any:
        # Emit one SSE "data:" event per generated token, then a final "done" event.
        for chat_chunk in chat_chunks:
            response = {
                'choices': [
                    {
                        'message': {
                            'role': 'system',
                            'content': llm.detokenize(chat_chunk)
                        },
                        'finish_reason': 'stop' if llm.detokenize(chat_chunk) == "[DONE]" else 'unknown'
                    }
                ]
            }
            yield f"data: {json.dumps(response)}\n\n"
        yield "event: done\ndata: {}\n\n"

    return StreamingResponse(format_response(chat_chunks), media_type="text/event-stream")

@app.post("/v0/chat/completions")
async def chat_v0(request: ChatCompletionRequestV0, response_mode=None):
    tokens = llm.tokenize(request.prompt)

    async def server_sent_events(tokens, llm):
        # EventSourceResponse sends each dict as an SSE event; one per token, then "[DONE]".
        for chat_chunk in llm.generate(tokens):
            yield dict(data=json.dumps(llm.detokenize(chat_chunk)))
        yield dict(data="[DONE]")

    return EventSourceResponse(server_sent_events(tokens, llm))

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
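
# Example usage (a sketch, assuming the server is running locally on port 8000):
#
#   curl -X POST http://localhost:8000/v1/completions \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "def fibonacci(n):"}'
#
#   curl -N -X POST http://localhost:8000/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Write a haiku about GGML."}]}'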