
Apollo V1 7B Usage Guide

Installation & Setup

Requirements

pip install "transformers>=4.44.0" "peft>=0.12.0" "torch>=2.0.0"
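An optional sanity check to confirm the installed versions and whether a GPU is visible:

import torch
import transformers
import peft

# Print the installed versions and CUDA availability
print("transformers:", transformers.__version__)
print("peft:", peft.__version__)
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())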

Basic Setup

from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
import torch

# Load model (adjust device_map based on your hardware)
model = AutoPeftModelForCausalLM.from_pretrained(
    "vanta-research/apollo-v1-7b",
    torch_dtype=torch.float16,
    device_map="auto"  # or "cpu" for CPU-only
)

tokenizer = AutoTokenizer.from_pretrained("vanta-research/apollo-v1-7b")
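A minimal smoke test, assuming a single GPU or CPU device, to confirm the adapter loads and generates end to end:

# Quick end-to-end check that the model generates text
prompt = "Solve this step by step: What is 12 + 30?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))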

Usage Patterns

1. Mathematical Problem Solving

def solve_math_problem(problem):
    prompt = f"Solve this step by step: {problem}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    outputs = model.generate(
        **inputs,
        max_length=400,
        temperature=0.1,  # Low temperature for accuracy
        do_sample=True,
        top_p=0.9
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Examples
problems = [
    "What is 15% of 240?",
    "If x + 5 = 12, what is x?", 
    "A rectangle has length 8 and width 5. What is its area?"
]

for problem in problems:
    solution = solve_math_problem(problem)
    print(f"Problem: {problem}")
    print(f"Solution: {solution}")
    print("-" * 50)

2. Legal Reasoning

def analyze_legal_scenario(scenario):
    prompt = f"Analyze this legal scenario: {scenario}"
    inputs = tokenizer(prompt, return_tensors="pt")
    
    outputs = model.generate(
        **inputs,
        max_length=600,
        temperature=0.2,  # Slightly higher for nuanced analysis
        repetition_penalty=1.1
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example legal scenarios
scenarios = [
    "A contract requires payment within 30 days, but the buyer received defective goods.",
    "Police conducted a search without a warrant, claiming exigent circumstances.",
    "An employee was fired for social media posts made outside work hours."
]

for scenario in scenarios:
    analysis = analyze_legal_scenario(scenario)
    print(f"Scenario: {scenario}")
    print(f"Analysis: {analysis}")
    print("-" * 50)

3. Logical Reasoning

def solve_logic_puzzle(puzzle):
    prompt = f"Solve this logic puzzle step by step: {puzzle}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    outputs = model.generate(
        **inputs,
        max_length=500,
        temperature=0.1,
        do_sample=True,
        top_k=50
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example logic puzzles
puzzles = [
    "If all A are B, and all B are C, what can we conclude about A and C?",
    "All cats are animals. Some animals are pets. Can we conclude all cats are pets?",
    "If it rains, the ground gets wet. The ground is wet. Did it rain?"
]

for puzzle in puzzles:
    solution = solve_logic_puzzle(puzzle)
    print(f"Puzzle: {puzzle}")
    print(f"Solution: {solution}")
    print("-" * 50)

Advanced Usage

Batch Processing

def batch_process_questions(questions, batch_size=4):
    results = []
    
    for i in range(0, len(questions), batch_size):
        batch = questions[i:i+batch_size]
        
        # Process each question sequentially within the batch to keep memory usage bounded
        batch_results = []
        for question in batch:
            inputs = tokenizer(question, return_tensors="pt").to(model.device)
            outputs = model.generate(**inputs, max_length=300)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            batch_results.append(response)
        
        results.extend(batch_results)
    
    return results

Memory Optimization

# For limited GPU memory
import torch

def memory_efficient_generation(prompt, max_length=400):
    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=0.1,
            use_cache=True,  # Enable KV caching
            pad_token_id=tokenizer.eos_token_id
        )
        
        # Release cached GPU memory after generation
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

Custom Prompting

def create_apollo_prompt(question, context="", task_type="general"):
    """Create optimized prompts for different task types."""
    
    task_prompts = {
        "math": "Solve this mathematical problem step by step:",
        "legal": "Analyze this legal scenario considering relevant laws and precedents:",
        "logic": "Solve this logical reasoning problem step by step:",
        "general": "Please provide a clear and detailed response to:"
    }
    
    task_prompt = task_prompts.get(task_type, task_prompts["general"])
    
    if context:
        full_prompt = f"Context: {context}

{task_prompt} {question}"
    else:
        full_prompt = f"{task_prompt} {question}"
    
    return full_prompt

# Usage
question = "What is 25% of 160?"
prompt = create_apollo_prompt(question, task_type="math")
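The constructed prompt can then be fed to any of the generation helpers defined above, for example the memory-efficient one:

# Generate a response from the constructed prompt
response = memory_efficient_generation(prompt, max_length=200)
print(response)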

Performance Optimization

GPU Settings

# For RTX 3060 (12GB) or similar
model = AutoPeftModelForCausalLM.from_pretrained(
    "vanta-research/apollo-v1-7b",
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory={0: "10GB"}  # Cap GPU usage at 10GB, leaving headroom for activations
)

CPU Inference

# For CPU-only inference
model = AutoPeftModelForCausalLM.from_pretrained(
    "vanta-research/apollo-v1-7b",
    torch_dtype=torch.float32,  # Use float32 for CPU
    device_map="cpu"
)

Quantization (Coming Soon)

# 8-bit quantization for reduced memory usage (requires: pip install bitsandbytes)
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

model = AutoPeftModelForCausalLM.from_pretrained(
    "vanta-research/apollo-v1-7b",
    quantization_config=quantization_config
)

Integration Examples

FastAPI Server

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class QuestionRequest(BaseModel):
    question: str
    task_type: str = "general"
    max_length: int = 400

@app.post("/ask")
async def ask_apollo(request: QuestionRequest):
    prompt = create_apollo_prompt(request.question, task_type=request.task_type)
    response = memory_efficient_generation(prompt, request.max_length)
    
    return {
        "question": request.question,
        "response": response,
        "task_type": request.task_type
    }

# Run with: uvicorn app:app --host 0.0.0.0 --port 8000
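Once the server is running, it can be called over HTTP. A minimal client sketch using the requests library (an extra dependency, not installed above), assuming the server is listening on localhost:8000:

import requests

# Ask the local Apollo server a math question via the /ask endpoint defined above
payload = {"question": "What is 25% of 160?", "task_type": "math", "max_length": 300}
resp = requests.post("http://localhost:8000/ask", json=payload)
print(resp.json()["response"])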

Gradio Interface

import gradio as gr

def apollo_interface(message, task_type):
    prompt = create_apollo_prompt(message, task_type=task_type)
    return memory_efficient_generation(prompt)

interface = gr.Interface(
    fn=apollo_interface,
    inputs=[
        gr.Textbox(label="Your Question"),
        gr.Dropdown(["general", "math", "legal", "logic"], label="Task Type")
    ],
    outputs=gr.Textbox(label="Apollo's Response"),
    title="Apollo V1 7B Chat",
    description="Chat with Apollo V1 7B - Advanced Reasoning AI"
)

interface.launch(share=True)

Troubleshooting

Common Issues

  1. Out of Memory: Reduce batch size, use CPU inference, or enable memory optimization (see the fallback sketch after this list)
  2. Slow Generation: Check device placement, enable caching, optimize prompt length
  3. Poor Quality: Adjust temperature (lower for factual, higher for creative)
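One possible fallback pattern for the out-of-memory case, sketched here with the globally loaded model and tokenizer; it retries with a shorter output budget if the first attempt exhausts GPU memory:

def generate_with_oom_fallback(prompt, max_length=400):
    """Try generation once; on GPU OOM, clear the cache and retry with a shorter limit."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    try:
        outputs = model.generate(**inputs, max_length=max_length, temperature=0.1, do_sample=True)
    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()  # free cached blocks before retrying
        outputs = model.generate(**inputs, max_length=max_length // 2, temperature=0.1, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)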

Performance Tips

  • Use torch.compile() for faster inference on PyTorch 2.0+ (see the sketch after this list)
  • Use gradient checkpointing during fine-tuning for memory efficiency (it does not apply to inference)
  • Use appropriate data types (float16 for GPU, float32 for CPU)
  • Optimize prompt length and structure
  • Consider quantization for resource-constrained environments
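A minimal torch.compile sketch (PyTorch 2.0+). Compiling the forward pass is one common way to speed up repeated generation; the first call is slower while compilation runs:

# Compile the model's forward pass; subsequent generations reuse the compiled graph
model.forward = torch.compile(model.forward)

inputs = tokenizer("What is 15% of 240?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))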

Best Practices

  1. Prompt Engineering: Be specific and clear in your questions
  2. Temperature Settings: Use 0.1-0.2 for factual/mathematical tasks, 0.3-0.7 for creative tasks (see the presets sketched after this list)
  3. Context Management: Provide relevant context for complex scenarios
  4. Verification: Always verify critical information, especially for legal/financial advice
  5. Ethical Usage: Use responsibly and within intended capabilities
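As an illustration of the temperature guidance above, sampling settings can be grouped per task type; the values below simply mirror the ones used elsewhere in this guide:

# Illustrative sampling presets following the temperature guidance above
GENERATION_PRESETS = {
    "math":    {"temperature": 0.1, "do_sample": True, "top_p": 0.9},
    "legal":   {"temperature": 0.2, "do_sample": True, "repetition_penalty": 1.1},
    "logic":   {"temperature": 0.1, "do_sample": True, "top_k": 50},
    "general": {"temperature": 0.5, "do_sample": True, "top_p": 0.95},
}

def ask(question, task_type="general", max_new_tokens=300):
    # Build a task-specific prompt, then generate with the matching preset
    prompt = create_apollo_prompt(question, task_type=task_type)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, **GENERATION_PRESETS[task_type])
    return tokenizer.decode(outputs[0], skip_special_tokens=True)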

For more examples and advanced usage patterns, see the GitHub repository and documentation.