# Coder / app.py
# (Hugging Face Space file; commit 0a7b900 "Create app.py" by Anonymous0045)
import os
import multiprocessing
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import config
# ----------------------------------------------------------------------
# Download the GGUF model file from the Hugging Face Hub.
# ----------------------------------------------------------------------
# Optional access token; None is acceptable for public repositories.
HF_TOKEN = os.environ.get("HF_TOKEN")

print("Downloading model from Hugging Face Hub...")
model_path = hf_hub_download(
    repo_id=config.MODEL_REPO,
    filename=config.MODEL_FILE,
    token=HF_TOKEN,
    cache_dir="/tmp/hf_cache",  # writable location on HF Spaces
)
print("Model downloaded successfully:", model_path)
# ----------------------------------------------------------------------
# Load the model into memory with llama.cpp.
# ----------------------------------------------------------------------
# Use every available core for CPU-only inference.
CPU_THREADS = multiprocessing.cpu_count()
print("CPU Threads available:", CPU_THREADS)

print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=config.CTX_SIZE,     # context window size from project config
    n_threads=CPU_THREADS,
    n_batch=512,               # prompt-processing batch size
    use_mmap=True,             # memory-map the GGUF file rather than copying it
    verbose=False,
)
print("Model loaded successfully.")
# ----------------------------------------------------------------------
# Prompt construction
# ----------------------------------------------------------------------
SYSTEM_PROMPT = """You are DeepSeek Coder, an expert programming assistant.
Write clean and efficient code.
Only explain when asked.
"""


def build_prompt(message, history):
    """Assemble the full text prompt sent to the model.

    Starts with the system prompt, replays each past (user, assistant)
    exchange, and ends with the new user message followed by an open
    "Assistant:" cue for the model to complete.
    """
    lines = [SYSTEM_PROMPT, "\n\n"]
    lines.extend(
        f"User: {user_turn}\nAssistant: {bot_turn}\n"
        for user_turn, bot_turn in history
    )
    lines.append(f"User: {message}\nAssistant:")
    return "".join(lines)
# ----------------------------------------------------------------------
# Streaming chat handler
# ----------------------------------------------------------------------
def chat(message, history):
    """Generate a streamed reply to *message* given prior chat *history*.

    Yields the cumulative response text after each token so the Gradio
    UI updates progressively.
    """
    prompt = build_prompt(message, history or [])
    pieces = []
    stream = llm(
        prompt,
        max_tokens=config.MAX_TOKENS,
        temperature=config.TEMPERATURE,
        top_p=0.95,
        stream=True,
    )
    for chunk in stream:
        pieces.append(chunk["choices"][0]["text"])
        yield "".join(pieces)
# ----------------------------------------------------------------------
# Gradio chat UI
# ----------------------------------------------------------------------
demo = gr.ChatInterface(
    fn=chat,
    title="DeepSeek Coder 1.3B",
    description="Production GGUF model running on llama.cpp",
)

# Bind to all interfaces on the standard Hugging Face Spaces port.
demo.launch(server_name="0.0.0.0", server_port=7860)