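# FastAPI server exposing TheBloke/WizardCoder-15B-1.0-GGML (loaded with
# ctransformers) behind OpenAI-style completion and chat endpoints.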
import fastapi
import json
import uvicorn
from itertools import islice
from fastapi import HTTPException
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
from ctransformers import AutoModelForCausalLM
from pydantic import BaseModel
from typing import List, Dict, Any
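
# Load the quantized WizardCoder GGML weights; WizardCoder is a StarCoder
# fine-tune, hence model_type="starcoder".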
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/WizardCoder-15B-1.0-GGML",
    model_file="WizardCoder-15B-1.0.ggmlv3.q4_0.bin",
    model_type="starcoder",
)
app = fastapi.FastAPI(title="🪄WizardCoder💫")
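
# Allow any origin so browser clients (e.g. the embedded playground iframe)
# can call the API.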
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
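
# Landing page: embeds the hosted Monaco editor Space and links to the model
# card and the FastAPI docs.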
@app.get("/")
async def index():
    html_content = """
    <html>
    <head>
    </head>
    <body style="background-color:black">
    <h2 style="font-family:system-ui"><a href="https://huggingface.co./TheBloke/WizardCoder-15B-1.0-GGML">wizardcoder-ggml</a></h2>
    <iframe
        src="https://matthoffner-monacopilot.hf.space"
        frameborder="0"
        width="95%"
        height="90%"
    ></iframe>
    <h2 style="font-family:system-ui"><a href="https://matthoffner-wizardcoder-ggml.hf.space/docs">FastAPI Docs</a></h2>
    </body>
    </html>
    """
    return HTMLResponse(content=html_content, status_code=200)
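
# Request schemas: /v1/completions takes a bare prompt; /v1/chat/completions
# takes OpenAI-style role/content messages plus a max_tokens cap.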
class ChatCompletionRequest(BaseModel):
    prompt: str


class Message(BaseModel):
    role: str
    content: str


class ChatCompletionRequestV2(BaseModel):
    messages: List[Message]
    max_tokens: int = 100
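
# One-shot completion: run the prompt through the model and return the
# generated text.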
@app.post("/v1/completions")
async def completion(request: ChatCompletionRequest, response_mode=None):
response = llm(request.prompt)
return response
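
# Chat endpoint: folds the conversation into one prompt, generates up to
# max_tokens ids, and returns a single assistant choice.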
@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequestV2):
tokens = llm.tokenize([message.content for message in request.messages])
try:
chat_chunks = llm.generate(tokens, max_tokens=request.max_tokens)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
def format_response(chat_chunks) -> Dict[str, Any]:
response = {
'choices': []
}
for chat_chunk in chat_chunks:
response['choices'].append({
'message': {
'role': 'system',
'content': llm.detokenize(chat_chunk)
},
'finish_reason': 'stop' if llm.detokenize(chat_chunk) == "[DONE]" else 'unknown'
})
return response
return format_response(chat_chunks)
@app.post("/v0/chat/completions")
async def chat(request: ChatCompletionRequest, response_mode=None):
tokens = llm.tokenize(request.prompt)
async def server_sent_events(chat_chunks, llm):
for chat_chunk in llm.generate(chat_chunks):
yield dict(data=json.dumps(llm.detokenize(chat_chunk)))
yield dict(data="[DONE]")
return EventSourceResponse(server_sent_events(tokens, llm))
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
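
# Example requests (a sketch, assuming the server is reachable on localhost:8000):
#   curl -X POST http://localhost:8000/v1/completions \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "def fibonacci(n):"}'
#   curl -X POST http://localhost:8000/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Write a bubble sort in Python."}], "max_tokens": 64}'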