import json
from typing import Any, AsyncGenerator, Generator, List

import fastapi
import uvicorn
from fastapi import HTTPException, Request
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
from ctransformers import AutoModelForCausalLM
from pydantic import BaseModel
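
# Load the quantized WizardCoder-15B GGML weights through ctransformers (8 CPU threads).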
llm = AutoModelForCausalLM.from_pretrained("TheBloke/WizardCoder-15B-1.0-GGML",
model_file="WizardCoder-15B-1.0.ggmlv3.q5_0.bin",
model_type="starcoder",
threads=8)
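
# FastAPI app with permissive CORS so browser-based frontends (e.g. the sandbox linked below) can call it.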
app = fastapi.FastAPI(title="🪄WizardCoder💫")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
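
# Landing page linking to the model card, the generated API docs, and demo UIs.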
@app.get("/")
async def index():
html_content = """
<html>
<head>
</head>
<body style="font-family:system-ui">
<h2><a href="https://huggingface.co./TheBloke/WizardCoder-15B-1.0-GGML">wizardcoder-ggml</a></h2>
<h2><a href="https://matthoffner-wizardcoder-ggml.hf.space/docs">FastAPI Docs</a></h2>
<h2><a href="https://wizardcoder-sandbox.netlify.app">Wizardcoder Sandbox</a></h2>
<h2><a href="https://matthoffner-monacopilot.hf.space">monacopilot</a></h2>
</body>
</html>
"""
return HTMLResponse(content=html_content, status_code=200)
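
# Request schemas: v0 takes a raw prompt, v1/v2 take OpenAI-style chat messages.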
class ChatCompletionRequestV0(BaseModel):
    prompt: str


class Message(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    messages: List[Message]
    max_tokens: int = 250
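
# Plain (non-streaming) completion: run the prompt through the model and return the raw text.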
@app.post("/v1/completions")
async def completion(request: ChatCompletionRequestV0, response_mode=None):
response = llm(request.prompt)
return response
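
# Wrap generated token chunks in OpenAI-style SSE "data:" frames for /v1/chat/completions.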
async def generate_response(chat_chunks, llm):
    for chat_chunk in chat_chunks:
        response = {
            'choices': [
                {
                    'message': {
                        'role': 'system',
                        'content': llm.detokenize(chat_chunk)
                    },
                    # chat_chunk is a token id, so use the model's EOS check
                    # rather than comparing it to the "<|end|>" string.
                    'finish_reason': 'stop' if llm.is_eos_token(chat_chunk) else 'unknown'
                }
            ]
        }
        yield f"data: {json.dumps(response)}\n\n"
    # StreamingResponse expects str/bytes chunks, not dicts.
    yield "data: [DONE]\n\n"
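
# v1 chat endpoint: concatenate the message contents, tokenize, and stream SSE chunks.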
@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequest):
combined_messages = ' '.join([message.content for message in request.messages])
tokens = llm.tokenize(combined_messages)
try:
chat_chunks = llm.generate(tokens)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
return StreamingResponse(generate_response(chat_chunks, llm), media_type="text/event-stream")
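
# Async generator used by the v2 endpoint; emits SSE frames and a final "done" event.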
async def stream_response(tokens: Any) -> AsyncGenerator[Any, None]:
try:
iterator: Generator = llm.generate(tokens)
for chat_chunk in iterator:
response = {
'choices': [
{
'message': {
'role': 'system',
'content': llm.detokenize(chat_chunk)
},
'finish_reason': 'stop' if llm.is_eos_token(chat_chunk) else 'unknown'
}
]
}
yield f"data: {json.dumps(response)}\n\n"
yield b"event: done\ndata: {}\n\n"
except Exception as e:
print(f"Exception in event publisher: {str(e)}")
async def chatV2(request: Request, body: ChatCompletionRequest):
combined_messages = ' '.join([message.content for message in body.messages])
tokens = llm.tokenize(combined_messages)
    return StreamingResponse(stream_response(tokens), media_type="text/event-stream")
@app.post("/v2/chat/completions")
async def chatV2_endpoint(request: Request, body: ChatCompletionRequest):
return await chatV2(request, body)
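
# Legacy v0 endpoint: raw prompt in, server-sent events out via sse-starlette.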
@app.post("/v0/chat/completions")
async def chat_v0(request: ChatCompletionRequestV0, response_mode=None):
tokens = llm.tokenize(request.prompt)
    async def server_sent_events(tokens, llm):
        for chat_chunk in llm.generate(tokens):
            yield dict(data=json.dumps(llm.detokenize(chat_chunk)))
yield dict(data="[DONE]")
return EventSourceResponse(server_sent_events(tokens, llm))
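
# Launch the API with uvicorn when this file is executed directly.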
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)