"""Gradio chat app serving a GGUF DeepSeek Coder model via llama.cpp.

On import this module downloads the model file from the Hugging Face Hub,
loads it with llama-cpp-python, and builds a Gradio ChatInterface bound to
a streaming `chat` generator.
"""

import multiprocessing
import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

import config

# ============================
# Download Model
# ============================
# Token is optional; only needed for gated/private repos.
HF_TOKEN = os.environ.get("HF_TOKEN")

print("Downloading model from Hugging Face Hub...")
model_path = hf_hub_download(
    repo_id=config.MODEL_REPO,
    filename=config.MODEL_FILE,
    token=HF_TOKEN,
    cache_dir="/tmp/hf_cache",
)
print("Model downloaded successfully:", model_path)

# ============================
# Load Model
# ============================
CPU_THREADS = multiprocessing.cpu_count()
print("CPU Threads available:", CPU_THREADS)

print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=config.CTX_SIZE,
    n_threads=CPU_THREADS,
    n_batch=512,
    use_mmap=True,  # memory-map weights instead of copying them into RAM
    verbose=False,
)
print("Model loaded successfully.")

# ============================
# Prompt Builder
# ============================
SYSTEM_PROMPT = """You are DeepSeek Coder, an expert programming assistant. Write clean and efficient code. Only explain when asked. """


def _history_pairs(history):
    """Normalize chat history to a list of (user, assistant) string pairs.

    Gradio delivers history either as [user, assistant] pairs (tuples format)
    or, with ``type="messages"``, as OpenAI-style role dicts. The original
    code crashed on the dict form; accept both.
    """
    if not history:
        return []
    if isinstance(history[0], dict):
        pairs = []
        pending_user = None
        for msg in history:
            role = msg.get("role")
            if role == "user":
                pending_user = msg.get("content", "")
            elif role == "assistant" and pending_user is not None:
                pairs.append((pending_user, msg.get("content", "")))
                pending_user = None
        return pairs
    return [tuple(turn) for turn in history]


def build_prompt(message, history):
    """Build the full text prompt: system prompt, prior turns, new message.

    The prompt ends with "Assistant:" so the model continues as the
    assistant.
    """
    prompt = SYSTEM_PROMPT + "\n\n"
    for user_msg, assistant_msg in _history_pairs(history):
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"
    return prompt


# ============================
# Generate Response
# ============================
def chat(message, history):
    """Stream a completion for *message*, yielding the growing response text."""
    prompt = build_prompt(message, history or [])

    output = ""
    for token in llm(
        prompt,
        max_tokens=config.MAX_TOKENS,
        temperature=config.TEMPERATURE,
        top_p=0.95,
        # Fix: without a stop sequence the model keeps going and invents
        # fake "User:" turns after its answer.
        stop=["User:"],
        stream=True,
    ):
        output += token["choices"][0]["text"]
        yield output


# ============================
# Launch Gradio ChatInterface
# ============================
demo = gr.ChatInterface(
    fn=chat,
    title="DeepSeek Coder 1.3B",
    description="Production GGUF model running on llama.cpp",
)

# Guard the launch so importing this module (e.g. from tests or a Spaces
# runtime that serves `demo` itself) does not start a second server.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )