File size: 3,510 Bytes
a57f72f 0d49ac1 210500b a57f72f 94d3ebe 3c54391 a57f72f 94d3ebe 0d49ac1 a57f72f 3c54391 210500b a57f72f 82c09a3 66c9b7e 82c09a3 66c9b7e 82c09a3 66c9b7e 82c09a3 a57f72f 46ac909 a57f72f 0d49ac1 46ac909 0d49ac1 4efb547 0d49ac1 7982958 210500b 94d3ebe 4f37acf 0d49ac1 4efb547 0d49ac1 94d3ebe 0d49ac1 94d3ebe 0d49ac1 94d3ebe 0d49ac1 210500b 46ac909 f7d8687 3c54391 3650102 40717b9 a57f72f 3c54391 a57f72f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import fastapi
import json
import markdown
import uvicorn
from fastapi import HTTPException
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
from starlette.responses import StreamingResponse
from ctransformers import AutoModelForCausalLM
from pydantic import BaseModel
from typing import List, Dict, Any, Generator
# Hugging Face repository and weight file of the model served by this app.
_MODEL_REPO = "TheBloke/WizardCoder-15B-1.0-GGML"
_MODEL_FILE = "WizardCoder-15B-1.0.ggmlv3.q4_0.bin"

# Loaded once at import time; the request handlers in this module all use
# this single shared instance.
llm = AutoModelForCausalLM.from_pretrained(
    _MODEL_REPO,
    model_file=_MODEL_FILE,
    model_type="starcoder",
)
app = fastapi.FastAPI(title="🪄WizardCoder💫")

# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is a
# very permissive policy -- confirm this is intended for the deployment.
_cors_options = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)
# Static landing page: a link to the model card, an embedded editor iframe,
# and a link to the generated API docs.
_INDEX_HTML = """
<html>
<head>
</head>
<body style="background-color:black">
<h2 style="font-family:system-ui"><a href="https://huggingface.co./TheBloke/WizardCoder-15B-1.0-GGML">wizardcoder-ggml</a></h2>
<iframe
src="https://matthoffner-monacopilot.hf.space"
frameborder="0"
width="95%"
height="90%"
></iframe>
<h2 style="font-family:system-ui"><a href="https://matthoffner-wizardcoder-ggml.hf.space/docs">FastAPI Docs</a></h2>
</body>
</html>
"""


@app.get("/")
async def index():
    """Serve the static landing page with HTTP status 200."""
    return HTMLResponse(content=_INDEX_HTML, status_code=200)
class ChatCompletionRequestV0(BaseModel):
    """Request body carrying a single raw prompt string."""

    # Text handed to the model as-is.
    prompt: str
class Message(BaseModel):
    """One chat message: who said it and what was said."""

    # Speaker label supplied by the client; not validated against a fixed set.
    role: str
    # Message text.
    content: str
class ChatCompletionRequest(BaseModel):
    """Request body for the message-based chat endpoint."""

    # Conversation so far, in the order given by the client.
    messages: List[Message]
    # Generation budget requested by the client; defaults to 250.
    max_tokens: int = 250
@app.post("/v1/completions")
async def completion(request: ChatCompletionRequestV0, response_mode=None):
    """Run a single, non-streaming completion for a raw prompt.

    The body is validated against ``ChatCompletionRequestV0`` because this
    handler reads ``request.prompt``; ``ChatCompletionRequest`` has no such
    field, so using it made every call fail with ``AttributeError``.

    Args:
        request: Body containing the ``prompt`` string.
        response_mode: Accepted for compatibility; not used by this handler.

    Returns:
        Whatever ``llm(prompt)`` returns for the given prompt.
    """
    response = llm(request.prompt)
    return response
@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequest):
    """Stream a chat completion as server-sent events.

    The ``content`` of every request message is joined with single spaces
    into one prompt, tokenized and passed to the model. Each generated token
    is sent as one ``data:`` frame whose JSON payload holds the detokenized
    text under ``choices[0].message.content``. A final ``event: done`` frame
    marks the end of the stream.

    Args:
        request: Chat messages plus ``max_tokens``, the upper bound on the
            number of tokens streamed back.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream``.

    Raises:
        HTTPException: Status 500 if the token stream cannot be set up.
    """
    # Function-scope import so this handler does not depend on edits to the
    # file's import block.
    from itertools import islice

    combined_messages = ' '.join(message.content for message in request.messages)
    tokens = llm.tokenize(combined_messages)

    try:
        # ctransformers documents ``max_new_tokens`` for ``LLM.__call__`` but
        # not for ``LLM.generate``, so passing it as a keyword there raises
        # TypeError. Cap the number of streamed tokens with islice instead.
        chat_chunks = islice(llm.generate(tokens), request.max_tokens)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    async def format_response(chat_chunks: Any) -> Any:
        """Yield one SSE ``data:`` frame per token, then a ``done`` event."""
        for chat_chunk in chat_chunks:
            # Detokenize once and reuse the text for both payload fields.
            text = llm.detokenize(chat_chunk)
            response = {
                'choices': [
                    {
                        'message': {
                            'role': 'system',
                            'content': text
                        },
                        'finish_reason': 'stop' if text == "[DONE]" else 'unknown'
                    }
                ]
            }
            yield f"data: {json.dumps(response)}\n\n"
        yield "event: done\ndata: {}\n\n"

    return StreamingResponse(format_response(chat_chunks), media_type="text/event-stream")
@app.post("/v0/chat/completions")
async def chat(request: ChatCompletionRequestV0, response_mode=None):
    """Stream a completion for a raw prompt via ``EventSourceResponse``.

    Each generated token is detokenized, JSON-encoded and sent as the
    ``data`` of one event; a literal ``[DONE]`` event ends the stream.

    Args:
        request: Body containing the ``prompt`` string.
        response_mode: Accepted for compatibility; not used by this handler.
    """
    prompt_tokens = llm.tokenize(request.prompt)

    async def server_sent_events(token_ids, model):
        """Yield one event dict per generated token, then the terminator."""
        for token in model.generate(token_ids):
            payload = json.dumps(model.detokenize(token))
            yield {"data": payload}
        yield {"data": "[DONE]"}

    return EventSourceResponse(server_sent_events(prompt_tokens, llm))
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 8000.
    # (A stray trailing "|" after this call was removed; it left a dangling
    # binary operator and made the statement a syntax error.)
    uvicorn.run(app, host="0.0.0.0", port=8000)