# ai.py
# This file handles all AI interactions, including loading/unloading models,
# generating responses, and injecting personas using the Ollama API.

import os
import re

import requests
from dotenv import load_dotenv

from personality import load_persona
from logger import setup_logger

# Set up logger specifically for AI operations
logger = setup_logger("ai")

# Load environment variables from .env file
load_dotenv()

# Base API setup from .env (e.g., http://localhost:11434/api)
BASE_API = os.getenv("OLLAMA_API", "").rstrip("/")  # Remove trailing slash just in case

# Ensure the API base is configured before building endpoints
if not BASE_API:
    logger.error("❌ OLLAMA_API not set.")
    raise ValueError("❌ OLLAMA_API not set.")

# API endpoints for different Ollama operations
GEN_ENDPOINT = f"{BASE_API}/generate"
PULL_ENDPOINT = f"{BASE_API}/pull"
# No UNLOAD_ENDPOINT is needed because unloading is done via `generate` with keep_alive=0
TAGS_ENDPOINT = f"{BASE_API}/tags"

# Startup model and debug toggle from .env
MODEL_NAME = os.getenv("MODEL_NAME", "llama3:latest")
SHOW_THINKING_BLOCKS = os.getenv("SHOW_THINKING_BLOCKS", "false").lower() == "true"


# Returns the current model from env/config
def get_model_name():
    return MODEL_NAME


# Removes <think>...</think> blocks from the LLM response (emitted by some reasoning models)
def strip_thinking_block(text: str) -> str:
    return re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)


# Check whether a model exists locally by calling /tags
def model_exists_locally(model_name: str) -> bool:
    try:
        resp = requests.get(TAGS_ENDPOINT)
        return model_name in resp.text
    except Exception as e:
        logger.error(f"❌ Failed to check local models: {e}")
        return False


# Attempt to pull (load) a model via Ollama's /pull endpoint
def load_model(model_name: str) -> bool:
    try:
        logger.info(f"🧠 Preloading model: {model_name}")
        resp = requests.post(PULL_ENDPOINT, json={"name": model_name})
        logger.info(f"📨 Ollama pull response: {resp.status_code} - {resp.text}")
        return resp.status_code == 200
    except Exception as e:
        logger.error(f"❌ Exception during model load: {e}")
        return False


# Send an empty prompt with keep_alive: 0 to safely unload a model from VRAM
def unload_model(model_name: str) -> bool:
    try:
        logger.info(f"🧹 Sending safe unload request for `{model_name}`")
        payload = {
            "model": model_name,
            "keep_alive": 0,  # Tells Ollama to remove the model from memory, not disk
        }
        resp = requests.post(GEN_ENDPOINT, json=payload)
        logger.info(f"🧽 Ollama unload response: {resp.status_code} - {resp.text}")
        return resp.status_code == 200
    except Exception as e:
        logger.error(f"❌ Exception during soft-unload: {e}")
        return False


# Shortcut for getting the current model (can be expanded later for dynamic switching)
def get_current_model():
    return get_model_name()


# Main LLM interaction: injects the persona and sends the prompt to Ollama
def get_ai_response(user_prompt):
    model_name = get_model_name()
    load_model(model_name)  # Ensures the model is pulled and ready

    persona = load_persona()
    if persona:
        # Replace curly quotes with straight ones and build the final prompt with character injection
        safe_inject = persona["prompt_inject"].replace("“", "\"").replace("”", "\"").replace("’", "'")
        full_prompt = f"{safe_inject}\nUser: {user_prompt}\n{persona['name']}:"
    else:
        full_prompt = user_prompt  # Fall back to the raw prompt if no persona is loaded

    payload = {
        "model": model_name,  # 🔧 Suggested fix: previously hardcoded to MODEL_NAME
        "prompt": full_prompt,
        "stream": False,
        # Optional: add "keep_alive": 300 to keep the model warm
    }

    logger.info("🛰️ SENDING TO OLLAMA /generate")
    logger.info(f"Payload: {payload}")

    try:
        response = requests.post(GEN_ENDPOINT, json=payload)
        logger.info(f"📨 Raw response: {response.text}")

        if response.status_code == 200:
            result = response.json()
            response_text = result.get("response", "[No message in response]")
            return response_text if SHOW_THINKING_BLOCKS else strip_thinking_block(response_text)
        else:
            return f"[Error {response.status_code}] {response.text}"
    except Exception as e:
        return f"[Exception] {str(e)}"