import json
from itertools import islice
from typing import Any, Dict, List

import fastapi
import uvicorn
from ctransformers import AutoModelForCausalLM
from fastapi import HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from sse_starlette.sse import EventSourceResponse


# Load the GGML-quantized WizardCoder weights (StarCoder architecture)
# from the Hugging Face Hub via ctransformers.
llm = AutoModelForCausalLM.from_pretrained("TheBloke/WizardCoder-15B-1.0-GGML",
                                           model_file="WizardCoder-15B-1.0.ggmlv3.q4_0.bin",
                                           model_type="starcoder")
app = fastapi.FastAPI(title="🪄WizardCoder💫")

# Allow cross-origin requests so the embedded editor iframe can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def index():
    html_content = """
    <html>
        <head>
        </head>
        <body style="background-color:black">
            <h2 style="font-family:system-ui"><a href="https://huggingface.co./TheBloke/WizardCoder-15B-1.0-GGML">wizardcoder-ggml</a></h2>
            <iframe
                src="https://matthoffner-monacopilot.hf.space"
                frameborder="0"
                width="95%"
                height="90%"
            ></iframe>
            <h2 style="font-family:system-ui"><a href="https://matthoffner-wizardcoder-ggml.hf.space/docs">FastAPI Docs</a></h2>
        </body>
    </html>
    """
    return HTMLResponse(content=html_content, status_code=200)

class ChatCompletionRequest(BaseModel):
    prompt: str

class Message(BaseModel):
    role: str
    content: str

class ChatCompletionRequestV2(BaseModel):
    messages: List[Message]
    max_tokens: int = 100

@app.post("/v1/completions")
async def completion(request: ChatCompletionRequest, response_mode=None):
    response = llm(request.prompt)
    return response
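
# Example invocation (assuming the server is running locally on port 8000):
#
#   curl -X POST http://localhost:8000/v1/completions \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "def fibonacci(n):"}'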

@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequestV2):
    tokens = llm.tokenize([message.content for message in request.messages])
    
    try:
        chat_chunks = llm.generate(tokens, max_tokens=request.max_tokens)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    def format_response(chat_chunks) -> Dict[str, Any]:
        # Collapse the generated tokens into a single assistant message,
        # mirroring the OpenAI chat-completions response shape.
        return {
            'choices': [{
                'message': {
                    'role': 'assistant',
                    'content': llm.detokenize(chat_chunks)
                },
                'finish_reason': 'stop'
            }]
        }

    return format_response(chat_chunks)
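
# A minimal client sketch (hypothetical usage, assuming a local server on
# port 8000 and the `requests` package installed):
#
#   import requests
#   r = requests.post(
#       "http://localhost:8000/v1/chat/completions",
#       json={"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 50},
#   )
#   print(r.json()["choices"][0]["message"]["content"])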

@app.post("/v0/chat/completions")
async def chat(request: ChatCompletionRequest, response_mode=None):
    tokens = llm.tokenize(request.prompt)
    async def server_sent_events(chat_chunks, llm):
        for chat_chunk in llm.generate(chat_chunks):
            yield dict(data=json.dumps(llm.detokenize(chat_chunk)))
        yield dict(data="[DONE]")

    return EventSourceResponse(server_sent_events(tokens, llm))
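
# Example of consuming the event stream (assuming a local server on port 8000);
# -N disables curl's buffering so chunks appear as they are generated:
#
#   curl -N -X POST http://localhost:8000/v0/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a haiku about compilers"}'
#
# The stream yields JSON-encoded text chunks and ends with a "[DONE]" event.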

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)