Merge pull request 'Model-loading-test' (#28) from Model-loading-test into main

Reviewed-on: #28
milo 2025-05-13 22:49:00 -04:00
commit ea206a1d7f
6 changed files with 551 additions and 62 deletions

.env (5 changed lines)

@@ -1,4 +1,5 @@
 DISCORD_TOKEN=MTM2OTc3NDY4OTYzNDg4MTU4Ng.G9Nrgz.akHoOO9SrXCDwiOCI3BUXfdR4bpSNb9zrVx9UI
-OLLAMA_API=http://192.168.1.100:11434/api/generate
+OLLAMA_API=http://192.168.1.100:11434/api/
-MODEL_NAME=deepseek-r1:8b
+MODEL_NAME=gemma3:12b
 CHANNEL_ID=1370420592360161393
+SHOW_THINKING_BLOCKS=false

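OLLAMA_API now points at the Ollama API base rather than a single endpoint, so the individual routes (/generate, /pull, /tags) are derived in code. For a quick sanity check of that base outside the bot, a minimal, hypothetical sketch (not part of this commit), assuming requests and python-dotenv are installed and the .env above is in the working directory:

# check_ollama.py: quick connectivity check against the configured Ollama base URL (sketch).
import os

import requests
from dotenv import load_dotenv

load_dotenv()  # reads OLLAMA_API and MODEL_NAME from the .env shown above

base = (os.getenv("OLLAMA_API") or "").rstrip("/")   # e.g. http://192.168.1.100:11434/api
resp = requests.get(f"{base}/tags", timeout=5)       # /tags lists locally available models
resp.raise_for_status()
names = [m["name"] for m in resp.json().get("models", [])]
print(f"Ollama reachable at {base}; local models: {names}")
print("Configured model present:", os.getenv("MODEL_NAME") in names)
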
bot.log (378 changed lines)

File diff suppressed because one or more lines are too long.

Binary file not shown.

ai.py

@@ -1,51 +1,117 @@
# ai.py
# This file handles all AI interactions, including loading/unloading models,
# generating responses, and injecting personas using the Ollama API.

import os
import requests
import re
from dotenv import load_dotenv
from personality import load_persona
from logger import setup_logger

# Set up logger specifically for AI operations
logger = setup_logger("ai")

# Load environment variables from .env file
load_dotenv()

# Base API setup from .env (e.g., http://localhost:11434/api)
BASE_API = os.getenv("OLLAMA_API").rstrip("/")  # Remove trailing slash just in case

# API endpoints for different Ollama operations
GEN_ENDPOINT = f"{BASE_API}/generate"
PULL_ENDPOINT = f"{BASE_API}/pull"
# UNLOAD_ENDPOINT is not used because unloading is done via `generate` with keep_alive=0
TAGS_ENDPOINT = f"{BASE_API}/tags"

# Startup model and debug toggle from .env
MODEL_NAME = os.getenv("MODEL_NAME", "llama3:latest")
SHOW_THINKING_BLOCKS = os.getenv("SHOW_THINKING_BLOCKS", "false").lower() == "true"

# Ensure API base is configured
if not BASE_API:
    logger.error("❌ OLLAMA_API not set.")
    raise ValueError("❌ OLLAMA_API not set.")
# Returns current model from env/config
def get_model_name():
    return MODEL_NAME

# Removes <think>...</think> blocks from the LLM response (used by some models)
def strip_thinking_block(text: str) -> str:
    return re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)

# Check if a model exists locally by calling /tags
def model_exists_locally(model_name: str) -> bool:
    try:
        resp = requests.get(TAGS_ENDPOINT)
        return model_name in resp.text
    except Exception as e:
        logger.error(f"❌ Failed to check local models: {e}")
        return False

# Attempt to pull (load) a model via Ollama's /pull endpoint
def load_model(model_name: str) -> bool:
    try:
        logger.info(f"🧠 Preloading model: {model_name}")
        resp = requests.post(PULL_ENDPOINT, json={"name": model_name})
        logger.info(f"📨 Ollama pull response: {resp.status_code} - {resp.text}")
        return resp.status_code == 200
    except Exception as e:
        logger.error(f"❌ Exception during model load: {str(e)}")
        return False

# Send an empty prompt to unload a model from VRAM safely using keep_alive: 0
def unload_model(model_name: str) -> bool:
    try:
        logger.info(f"🧹 Sending safe unload request for `{model_name}`")
        payload = {
            "model": model_name,
            "keep_alive": 0  # Tells Ollama to remove the model from memory, not disk
        }
        resp = requests.post(GEN_ENDPOINT, json=payload)
        logger.info(f"🧽 Ollama unload response: {resp.status_code} - {resp.text}")
        return resp.status_code == 200
    except Exception as e:
        logger.error(f"❌ Exception during soft-unload: {str(e)}")
        return False

# Shortcut for getting the current model (can be expanded later for dynamic switching)
def get_current_model():
    return get_model_name()
# Main LLM interaction — injects personality and sends prompt to Ollama
def get_ai_response(user_prompt):
    model_name = get_model_name()
    load_model(model_name)  # Ensures the model is pulled and ready

    persona = load_persona()
    if persona:
        # Clean fancy quotes and build final prompt with character injection
        safe_inject = persona["prompt_inject"].replace(""", "\"").replace(""", "\"").replace("'", "'")
        full_prompt = f"{safe_inject}\nUser: {user_prompt}\n{persona['name']}:"
    else:
        full_prompt = user_prompt  # fallback to raw prompt if no persona loaded

    payload = {
        "model": model_name,  # 🔧 Suggested fix: previously hardcoded to MODEL_NAME
        "prompt": full_prompt,
        "stream": False
        # optional: add "keep_alive": 300 to keep model warm
    }

    logger.info("🛰️ SENDING TO OLLAMA /generate")
    logger.info(f"Payload: {payload}")

    try:
        response = requests.post(GEN_ENDPOINT, json=payload)
        logger.info(f"📨 Raw response: {response.text}")
        if response.status_code == 200:
            result = response.json()
            response_text = result.get("response", "[No message in response]")
            return strip_thinking_block(response_text) if not SHOW_THINKING_BLOCKS else response_text
        else:
            return f"[Error {response.status_code}] {response.text}"
    except Exception as e:
        return f"[Exception] {str(e)}"

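Together these helpers give the bot a pull, generate, soft-unload cycle against Ollama's REST API: /pull fetches a model, /generate produces completions, and /generate with keep_alive: 0 evicts a model from VRAM without deleting it from disk. A rough standalone usage sketch (not part of this commit) built only on the functions defined above, assuming a reachable Ollama host and the .env shown earlier:

# demo_ai.py: exercise the ai.py helpers end to end (illustrative sketch only).
from ai import get_current_model, load_model, get_ai_response, unload_model

model = get_current_model()            # e.g. "gemma3:12b" from MODEL_NAME in .env
if load_model(model):                  # POST /pull: downloads the model if it is missing
    print(get_ai_response("Say hi in one sentence."))
    unload_model(model)                # POST /generate with keep_alive=0 frees VRAM only
else:
    print(f"Could not pull {model}; check `ollama list` and the OLLAMA_API host.")
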
bot.py

@@ -4,12 +4,45 @@ import os
import discord
import yaml
from discord.ext import commands
from textwrap import wrap
from dotenv import load_dotenv
import random

from scheduler import start_scheduler
from logger import setup_logger

logger = setup_logger("bot")

from ai import unload_model, load_model, get_current_model

dotenv_path = os.path.join(os.path.dirname(__file__), '..', '.env')
load_dotenv(dotenv_path)

logger.info(f"🔍 Loaded MODEL_NAME from .env: {os.getenv('MODEL_NAME')}")
MODEL_NAME = os.getenv("MODEL_NAME", "llama3:latest")
logger.info(f"🔍 Loaded MODEL_NAME from .env: {MODEL_NAME}")

# 🧽 Try to unload any currently loaded model
logger.info(f"🧹 Attempting to clear VRAM before loading {MODEL_NAME}...")
unload_model(MODEL_NAME)

# 🚀 Load target model from .env
if load_model(MODEL_NAME):
    logger.info(f"🚀 Model `{MODEL_NAME}` preloaded on startup.")
else:
    logger.warning(f"⚠️ Failed to preload model `{MODEL_NAME}`.")
logger.info(f"✅ Final model in use: {MODEL_NAME}")

from ai import get_ai_response, load_model
MODEL_NAME = os.getenv("MODEL_NAME", "llama3:latest")
if load_model(MODEL_NAME):
    logger.info(f"🚀 Model `{MODEL_NAME}` preloaded on startup.")
else:
    logger.warning(f"⚠️ Failed to preload model `{MODEL_NAME}`.")
logger.info(f"✅ Final model in use: {MODEL_NAME}")

from ai import get_ai_response
from personality import apply_personality, set_persona
from discord.ext.commands import (
    cooldown,
@@ -17,10 +50,6 @@ from discord.ext.commands import (
    CooldownMapping,
    CommandOnCooldown
)

base_dir = os.path.dirname(__file__)
settings_path = os.path.join(base_dir, "settings.yml")
@@ -32,8 +61,10 @@ ROAST_COOLDOWN_SECONDS = settings["cooldowns"]["roast"]
GLOBAL_COOLDOWN_SECONDS = settings["cooldowns"]["global"]
COOLDOWN_MSG_TEMPLATE = settings["messages"]["cooldown"]

TOKEN = os.getenv("DISCORD_TOKEN")
if not TOKEN:
    logger.error("❌ DISCORD_TOKEN not set in .env file.")
    raise SystemExit("DISCORD_TOKEN not set.")

intents = discord.Intents.default()
intents.message_content = True
@@ -44,8 +75,9 @@ bot = commands.Bot(command_prefix="!", intents=intents)
async def on_command_error(ctx, error):
    if isinstance(error, CommandOnCooldown):
        retry_secs = round(error.retry_after, 1)
        template = random.choice(COOLDOWN_MSG_TEMPLATE) if isinstance(COOLDOWN_MSG_TEMPLATE, list) else COOLDOWN_MSG_TEMPLATE
        msg = template.replace("{seconds}", str(retry_secs))
        logger.info(f"Command {ctx.command} on cooldown. Retry after {retry_secs} seconds.")
        await ctx.send(msg)
    else:
@@ -68,17 +100,26 @@ async def ping(ctx):
    await ctx.send("🏓 Pong!")

@bot.command()
async def chat(ctx, *, prompt):
    await ctx.send("🤖 Thinking...")
    reply = get_ai_response(prompt)

    MAX_DISCORD_MESSAGE_LENGTH = 2000
    # Split long replies into chunks that fit Discord limits
    chunks = wrap(reply, MAX_DISCORD_MESSAGE_LENGTH)

    # Log only if the response is being chunked
    if len(chunks) > 1:
        logger.warning(f"💬 Splitting response into {len(chunks)} chunks due to length.")

    for chunk in chunks:
        await ctx.send(chunk)
@bot.command()
async def setpersona(ctx, *, description):
    set_persona(description)
    await ctx.send("✅ Persona updated! New style will be used in replies.")

@bot.command(name='roast')
@cooldown(rate=1, per=ROAST_COOLDOWN_SECONDS, type=BucketType.user)
async def roast(ctx):
@@ -94,6 +135,71 @@ async def roast(ctx):
    # Send the roast back to the channel
    await ctx.send(f"😼 {response}")
@bot.command(name="clearmodel")
async def clear_model(ctx):
from ai import unload_model, get_current_model
model = get_current_model()
success = unload_model(model)
msg = f"✅ Unloaded model: `{model}`" if success else f"❌ Failed to unload model: `{model}`"
await ctx.send(msg)
@bot.command(name="model")
async def current_model(ctx):
from ai import get_current_model
model = get_current_model()
await ctx.send(f"📦 Current model: `{model}`")
@bot.command(name="setmodel")
async def set_model(ctx, *, model_name):
from ai import get_current_model, load_model, unload_model
current_model = get_current_model()
if model_name == current_model:
return await ctx.send(f"⚠️ `{model_name}` is already active.")
await ctx.send(f"🔄 Switching from `{current_model}` to `{model_name}`…")
# 1) Soft-unload old model from VRAM only
if unload_model(current_model):
await ctx.send(f"🧽 Unloaded `{current_model}` from VRAM.")
else:
await ctx.send(f"⚠️ Couldnt unload `{current_model}` (it may not have been loaded).")
# 2) Load the new one
if not load_model(model_name):
return await ctx.send(f"❌ Failed to pull `{model_name}`. Make sure its in `ollama list`.")
# 3) Update runtime AND .env on disk
os.environ["MODEL_NAME"] = model_name
env_path = os.path.join(os.path.dirname(__file__), '..', '.env')
# Read and rewrite .env
lines = []
with open(env_path, 'r', encoding='utf-8') as f:
for line in f:
if line.startswith("MODEL_NAME="):
lines.append(f"MODEL_NAME={model_name}\n")
else:
lines.append(line)
with open(env_path, 'w', encoding='utf-8') as f:
f.writelines(lines)
await ctx.send(f"✅ Model switched to `{model_name}` and `.env` updated.")
@bot.command(name="models")
async def list_models(ctx):
import requests
from ai import TAGS_ENDPOINT
try:
resp = requests.get(TAGS_ENDPOINT)
models = [m["name"] for m in resp.json().get("models", [])]
if models:
await ctx.send("🧠 Available models:\n" + "\n".join(f"- `{m}`" for m in models))
else:
await ctx.send("❌ No models found.")
except Exception as e:
await ctx.send(f"❌ Failed to fetch models: {e}")
@bot.event
async def on_ready():
    print(f"✅ Logged in as {bot.user.name}")

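The !chat handler above relies on textwrap.wrap to keep each message under Discord's 2,000-character limit. A small, self-contained illustration of that splitting (not part of this commit; the constant mirrors the handler, and note that wrap() collapses newlines and repeated whitespace by default, so a plain slicer is shown as an alternative when formatting must survive intact):

# chunk_demo.py: how a long reply is broken up before sending (illustration only).
from textwrap import wrap

MAX_DISCORD_MESSAGE_LENGTH = 2000      # Discord's per-message limit
reply = "word " * 1200                 # roughly 6,000 characters of fake model output

chunks = wrap(reply, MAX_DISCORD_MESSAGE_LENGTH)
print(len(chunks), [len(c) for c in chunks])   # a few chunks, each at most 2,000 chars

# Alternative that preserves newlines exactly, if formatting matters:
raw_chunks = [reply[i:i + MAX_DISCORD_MESSAGE_LENGTH]
              for i in range(0, len(reply), MAX_DISCORD_MESSAGE_LENGTH)]
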
settings.yml

@@ -4,7 +4,7 @@ cooldowns:
 messages:
   cooldown:
-    - "🕒 Chill, mortal. You must wait {seconds}s before trying again. 😼"
+    - "🕒 Chill, wait {seconds}s before trying again."
 scheduler:
   enabled: false
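
These values feed the cooldown handling shown in the bot code above: the cooldowns block supplies ROAST_COOLDOWN_SECONDS and GLOBAL_COOLDOWN_SECONDS, and messages.cooldown is now a list that the error handler picks from at random. A minimal sketch of that lookup (not part of this commit), assuming the file is loaded with yaml.safe_load, since the loading line itself falls outside the hunks shown here:

# settings_demo.py: reading the same keys the bot pulls from settings.yml (sketch).
import random
import yaml

with open("settings.yml", "r", encoding="utf-8") as f:
    settings = yaml.safe_load(f)   # assumption: the bot loads the file this way

ROAST_COOLDOWN_SECONDS = settings["cooldowns"]["roast"]    # per-user cooldown on !roast
GLOBAL_COOLDOWN_SECONDS = settings["cooldowns"]["global"]
COOLDOWN_MSG_TEMPLATE = settings["messages"]["cooldown"]   # a list after this change

# Mirrors the on_command_error handler: pick a template and fill in the wait time.
template = random.choice(COOLDOWN_MSG_TEMPLATE) if isinstance(COOLDOWN_MSG_TEMPLATE, list) else COOLDOWN_MSG_TEMPLATE
print(template.replace("{seconds}", "11.6"))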