The 80/20 insight that drives hybrid architectures. In most production systems, roughly 80% of incoming requests are straightforward cases that a fast, cheap classifier can handle correctly. The remaining 20% are complex, ambiguous, or novel cases that genuinely benefit from LLM reasoning. A well-designed hybrid pipeline routes each request to the cheapest model capable of handling it correctly, achieving LLM-level quality at a fraction of the cost. This section covers the major hybrid patterns: classical triage with LLM escalation, ensemble voting, cascading model architectures, and the router pattern.
1. Pattern: Classical Triage + LLM Escalation
The triage pattern is the most common hybrid architecture in production. A fast, cheap classifier processes every incoming request. When the classifier is confident in its prediction, the result is returned directly. When the classifier is uncertain (low confidence), the request is escalated to an LLM for more careful analysis.
1.1 Implementing Confidence-Based Routing
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from dataclasses import dataclass
@dataclass
class TriageResult:
    """Outcome of routing one request through the triage pipeline."""

    category: str  # predicted label for the request
    confidence: float  # winning-class probability (estimated when source is the LLM)
    source: str # "classifier" or "llm"
    cost: float  # estimated dollar cost of producing this prediction
class TriageRouter:
    """Routes requests to classifier or LLM based on confidence.

    The cheap TF-IDF + logistic-regression classifier answers directly when
    its top-class probability clears ``confidence_threshold``; anything less
    confident is escalated to the (more expensive) LLM.
    """

    def __init__(self, confidence_threshold: float = 0.85):
        # Predictions at or above this probability are trusted as-is.
        self.threshold = confidence_threshold
        self.vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        self.classifier = LogisticRegression(max_iter=1000)
        self.is_fitted = False

    def fit(self, texts: list[str], labels: list[str]):
        """Train the fast classifier."""
        X = self.vectorizer.fit_transform(texts)
        self.classifier.fit(X, labels)
        self.is_fitted = True

    def classify(self, text: str) -> "TriageResult":
        """Route to classifier or LLM based on confidence.

        Raises:
            RuntimeError: if called before :meth:`fit` — previously this
            surfaced as an opaque sklearn NotFittedError deep in transform().
        """
        if not self.is_fitted:
            raise RuntimeError("TriageRouter.classify() called before fit()")
        # Step 1: Get classifier prediction and confidence
        X = self.vectorizer.transform([text])
        probas = self.classifier.predict_proba(X)[0]
        # float() converts the numpy scalar to the plain float declared
        # by TriageResult.confidence.
        max_confidence = float(probas.max())
        predicted_class = self.classifier.classes_[probas.argmax()]
        # Step 2: Route based on confidence
        if max_confidence >= self.threshold:
            return TriageResult(
                category=predicted_class,
                confidence=max_confidence,
                source="classifier",
                cost=0.00001
            )
        # Below threshold: escalate to LLM (simulated here)
        llm_result = self._call_llm(text)
        return TriageResult(
            category=llm_result,
            confidence=0.95,  # LLM confidence (estimated)
            source="llm",
            cost=0.003
        )

    def _call_llm(self, text: str) -> str:
        """Call LLM for complex cases (simplified)."""
        # In production: call OpenAI / Anthropic API
        return "complex_case"
# Train and evaluate: 15 seed phrases across 5 categories, repeated 20x
# so the classifier sees enough samples per class.
_seed_examples = [
    ("charged twice", "billing"), ("double charge", "billing"), ("billing error", "billing"),
    ("app crashes", "technical"), ("error message", "technical"), ("won't load", "technical"),
    ("change email", "account"), ("update address", "account"), ("password reset", "account"),
    ("package lost", "shipping"), ("shipping delay", "shipping"), ("not delivered", "shipping"),
    ("pricing info", "general"), ("business hours", "general"), ("return policy", "general"),
]
train_texts = [phrase for phrase, _ in _seed_examples] * 20
train_labels = [label for _, label in _seed_examples] * 20

router = TriageRouter(confidence_threshold=0.85)
router.fit(train_texts, train_labels)

# Test with various difficulty levels
test_cases = [
    "I was charged twice on my credit card",  # Easy: billing
    "The page gives me a 500 error",  # Easy: technical
    "I want to change my account email and also get a refund",  # Hard: mixed
    "Your competitor offers better rates",  # Hard: ambiguous
]

print("Triage Results:")
print("-" * 70)
for sample in test_cases:
    outcome = router.classify(sample)
    print(f" Text: '{sample[:50]}...'")
    print(f" Category: {outcome.category} | "
          f"Confidence: {outcome.confidence:.2f} | "
          f"Source: {outcome.source} | "
          f"Cost: ${outcome.cost:.5f}")
    print()
2. Pattern: Ensemble Voting
In the ensemble pattern, multiple models (both classical and LLM) independently classify the same input, and their predictions are combined through a voting or weighting scheme. This pattern improves robustness: if one model makes an error, the others can outvote it. The tradeoff is that every request incurs the cost of all models.
from dataclasses import dataclass
from collections import Counter
@dataclass
class ModelPrediction:
    """One model's independent vote on a single input."""

    model_name: str  # human-readable model identifier
    prediction: str  # predicted label
    confidence: float  # confidence score, used as the vote weight
    cost: float  # dollar cost of this single prediction
def ensemble_vote(predictions: "list[ModelPrediction]",
                  strategy: str = "confidence_weighted") -> str:
    """Combine predictions from multiple models.

    Strategies:
        "majority"            -- one model, one vote.
        "confidence_weighted" -- each vote weighted by the model's confidence.
        "cost_aware"          -- cheap-model consensus unless the expensive
                                 models disagree, in which case they win.

    Raises:
        ValueError: for an unrecognized strategy. (Previously an unknown
        strategy silently fell through to a majority vote, hiding typos.)
    """
    if strategy not in ("majority", "confidence_weighted", "cost_aware"):
        raise ValueError(f"unknown ensemble strategy: {strategy!r}")
    if strategy == "majority":
        # Simple majority vote; Counter ties resolve to first-seen label.
        votes = [p.prediction for p in predictions]
        return Counter(votes).most_common(1)[0][0]
    if strategy == "confidence_weighted":
        # Weight each vote by model confidence
        scores: dict[str, float] = {}
        for p in predictions:
            scores[p.prediction] = scores.get(p.prediction, 0) + p.confidence
        # Ties resolve to the first-inserted label (dict insertion order).
        return max(scores, key=scores.get)
    # cost_aware: trust expensive models more when they disagree with cheap ones
    cheap = [p for p in predictions if p.cost < 0.001]
    expensive = [p for p in predictions if p.cost >= 0.001]
    if cheap and expensive:
        cheap_pred = Counter([p.prediction for p in cheap]).most_common(1)[0][0]
        exp_pred = Counter([p.prediction for p in expensive]).most_common(1)[0][0]
        # Agreement: trust it. Disagreement: trust the expensive model.
        return cheap_pred if cheap_pred == exp_pred else exp_pred
    # Only one price band present: fall back to a plain majority vote.
    return Counter([p.prediction for p in predictions]).most_common(1)[0][0]
# Simulate one ensemble round: three models, all agreeing on "billing".
predictions = [
    ModelPrediction(name, label, conf, price)
    for name, label, conf, price in (
        ("TF-IDF+LR", "billing", 0.82, 0.00001),
        ("BERT-base", "billing", 0.91, 0.0005),
        ("GPT-4o-mini", "billing", 0.95, 0.003),
    )
]
# Every strategy pays for all models, so the total cost is fixed.
ensemble_cost = sum(p.cost for p in predictions)
for scheme in ("majority", "confidence_weighted", "cost_aware"):
    winner = ensemble_vote(predictions, strategy=scheme)
    print(f" {scheme:25s} => {winner} (total cost: ${ensemble_cost:.5f})")
Ensembles run all models on every request, so they are best suited for high-stakes tasks where accuracy matters more than cost (medical diagnosis, legal document review, financial compliance). The triage pattern is better for high-volume, cost-sensitive applications where most requests are straightforward. In practice, many production systems use a hybrid of both: triage for easy cases, ensemble for the uncertain ones that get escalated.
3. Pattern: Cascading Model Architecture
The cascade pattern sends every request through a sequence of increasingly powerful (and expensive) models, stopping as soon as any model is confident enough. This differs from triage (which uses a single classifier for routing) by using progressively more capable models at each stage.
from dataclasses import dataclass
from typing import Optional
@dataclass
class CascadeResult:
    """Final answer plus cost/latency accounting for one cascade pass."""

    prediction: str  # label from the tier that answered
    confidence: float  # that tier's confidence
    tier: int  # 1-based index of the tier that answered
    total_cost: float  # summed cost of every tier that ran
    total_latency_ms: float  # summed latency of every tier that ran
class ModelCascade:
    """Cascading model architecture: small -> medium -> large.

    Each request walks the tiers cheapest-first and stops at the first
    tier whose confidence clears that tier's threshold; the final tier
    always answers (its effective threshold is 0.0).
    """

    def __init__(self, confidence_thresholds: list[float]):
        # In production, these would be real model instances
        self.tiers = [
            {"name": "Regex/Rules", "cost": 0.0, "latency_ms": 0.01},
            {"name": "BERT-tiny", "cost": 0.0001, "latency_ms": 5},
            {"name": "GPT-4o-mini", "cost": 0.003, "latency_ms": 400},
        ]
        # One threshold per non-final tier. Without this check, zip() in
        # predict() would silently truncate a too-short list and predict()
        # could fall off the loop, implicitly returning None.
        if len(confidence_thresholds) != len(self.tiers) - 1:
            raise ValueError(
                f"expected {len(self.tiers) - 1} confidence thresholds, "
                f"got {len(confidence_thresholds)}"
            )
        self.thresholds = confidence_thresholds

    def predict(self, text: str) -> "CascadeResult":
        """Run tiers in order, accumulating cost/latency, until confident."""
        total_cost = 0.0
        total_latency = 0.0
        # Appending 0.0 gives the final tier a threshold it always clears.
        for i, (tier, threshold) in enumerate(
            zip(self.tiers, self.thresholds + [0.0])
        ):
            total_cost += tier["cost"]
            total_latency += tier["latency_ms"]
            # Simulate prediction (in production: actual model call)
            prediction, confidence = self._run_tier(i, text)
            if confidence >= threshold or i == len(self.tiers) - 1:
                return CascadeResult(
                    prediction=prediction,
                    confidence=confidence,
                    tier=i + 1,
                    total_cost=total_cost,
                    total_latency_ms=total_latency,
                )

    def _run_tier(self, tier: int, text: str) -> tuple[str, float]:
        """Simulate tier prediction. Replace with real models."""
        import random
        # NOTE(review): hash() on str is salted per process (PYTHONHASHSEED),
        # so tier-1 confidences are deterministic within a run but not
        # across runs — fine for a demo, confirm before relying on it.
        random.seed(hash(text) + tier)
        if tier == 0:  # Regex
            if any(kw in text.lower() for kw in ["charged", "refund", "bill"]):
                return "billing", 0.99
            return "unknown", 0.30
        elif tier == 1:  # Small model
            return "billing", 0.75 + random.random() * 0.2
        else:  # Large LLM
            return "billing", 0.95
# Demo: three inputs designed to stop at three different tiers.
cascade = ModelCascade(confidence_thresholds=[0.90, 0.85])
demo_inputs = (
    "I was charged twice on my bill",  # Regex catches this
    "The interface feels sluggish lately",  # Needs small model
    "I have a complex multi-part question",  # Needs LLM
)

print("Cascade Routing Results:")
print("=" * 65)
for message in demo_inputs:
    outcome = cascade.predict(message)
    print(f" '{message[:45]}'")
    print(f" Tier {outcome.tier} | Conf: {outcome.confidence:.2f} | "
          f"Cost: ${outcome.total_cost:.5f} | "
          f"Latency: {outcome.total_latency_ms:.1f}ms\n")
4. Pattern: LLM Router
The router pattern uses a lightweight model (or even an LLM itself) to analyze the incoming request and decide which model should handle it. Unlike the cascade, which always starts at the cheapest tier, the router can skip directly to the appropriate tier based on the request's complexity.
import openai
import json

# Module-level client shared by route_request(); presumably picks up API
# credentials from the environment — confirm deployment configuration.
client = openai.OpenAI()
def route_request(text: str) -> dict:
    """Use a small LLM to decide which model should handle a request.

    Returns the parsed JSON dict; per the system prompt it should carry
    "tier", "reasoning", and "estimated_difficulty" keys.

    NOTE(review): no retry or error handling — an API failure or a JSON
    parse error (e.g. output truncated by max_tokens) propagates to the
    caller; confirm that is acceptable for production use.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": """Analyze this request and decide
which model tier should handle it. Return JSON:
- tier: "regex" for simple pattern matching (dates, emails, numbers)
- tier: "classifier" for standard classification with clear categories
- tier: "small_llm" for moderate complexity needing some reasoning
- tier: "large_llm" for complex, ambiguous, or multi-step reasoning
Also return:
- reasoning: one sentence explaining why
- estimated_difficulty: 1-5 scale"""},
            {"role": "user", "content": text}
        ],
        # Constrains the model to emit syntactically valid JSON.
        response_format={"type": "json_object"},
        temperature=0,  # deterministic routing decisions
        max_tokens=100,
    )
    return json.loads(response.choices[0].message.content)
# Example routing decisions across the four difficulty bands.
sample_requests = (
    "Extract all email addresses from this text: contact us at info@corp.com",
    "Is this customer review positive or negative: 'Great product!'",
    "Summarize the key points from this 3-page contract",
    "Given our Q3 financials and market trends, should we expand to Europe?",
)

print("Router Decisions:")
print("=" * 70)
for req in sample_requests:
    decision = route_request(req)
    print(f" Request: '{req[:55]}...'")
    print(f" Tier: {decision['tier']} | "
          f"Difficulty: {decision['estimated_difficulty']}/5")
    print(f" Reason: {decision['reasoning']}\n")
Using an LLM as the router adds cost to every single request. If the router itself costs $0.0003 per call, you need the routing savings to exceed this overhead. For high-volume systems, train a small BERT classifier as the router instead, or use simple heuristics (input length, keyword presence, question complexity score) to avoid the LLM router cost entirely.
5. Lab: Building a Customer Support Pipeline
Let us put these patterns together into a complete customer support pipeline that combines a classifier for routing, an LLM for complex extraction, and a rules engine for execution.
from dataclasses import dataclass, field
from typing import Optional
import json
@dataclass
class TicketAnalysis:
    """Accumulated results of the three pipeline stages for one ticket."""

    ticket_id: str  # external ticket identifier (e.g. "T-001")
    raw_text: str  # original customer message
    category: str  # stage-1 classification label
    routing_tier: str  # extraction path taken: "classifier + rules" or "classifier + llm"
    extracted_info: dict = field(default_factory=dict)  # stage-2 structured fields
    action: str = ""  # stage-3 action name looked up from the actions table
    total_cost: float = 0.0  # estimated processing cost in dollars
class CustomerSupportPipeline:
    """Three-stage pipeline: classify -> extract -> execute."""

    def __init__(self):
        # Categories simple enough for rule-based extraction.
        self.simple_categories = {"billing", "shipping", "account"}
        # Category -> downstream action name for the rules engine.
        self.actions = {
            "billing": "initiate_refund_review",
            "shipping": "create_tracking_inquiry",
            "account": "send_account_update_link",
            "technical": "create_engineering_ticket",
            "general": "route_to_agent",
        }

    def process_ticket(self, ticket_id: str, text: str) -> "TicketAnalysis":
        """Classify a ticket, extract structured info, and pick an action."""
        analysis = TicketAnalysis(ticket_id=ticket_id, raw_text=text,
                                  category="", routing_tier="")
        # Stage 1: Fast classification
        category, confidence = self._classify(text)
        analysis.category = category
        # ">=" keeps threshold semantics consistent with TriageRouter
        # (behavior-identical here: _classify only returns 0.90 or 0.50).
        if confidence >= 0.85 and category in self.simple_categories:
            # Stage 2a: Rule-based extraction for simple cases
            analysis.routing_tier = "classifier + rules"
            analysis.extracted_info = self._rule_extract(text, category)
            analysis.total_cost = 0.00001
        else:
            # Stage 2b: LLM extraction for complex cases
            analysis.routing_tier = "classifier + llm"
            analysis.extracted_info = self._llm_extract(text)
            analysis.total_cost = 0.005
        # Stage 3: Determine action
        analysis.action = self.actions.get(analysis.category, "route_to_agent")
        return analysis

    def _classify(self, text: str) -> tuple[str, float]:
        """Fast keyword classifier (replace with trained model)."""
        keywords = {
            "billing": ["charge", "refund", "payment", "invoice", "bill"],
            "shipping": ["package", "delivery", "tracking", "shipped"],
            "account": ["password", "email", "login", "account"],
            "technical": ["error", "crash", "bug", "broken", "slow"],
        }
        text_lower = text.lower()
        # First matching category wins (dict insertion order).
        for cat, words in keywords.items():
            if any(w in text_lower for w in words):
                return cat, 0.90
        return "general", 0.50

    def _rule_extract(self, text: str, category: str) -> dict:
        """Simple rule-based extraction for common patterns.

        ``category`` is currently unused; it is kept so category-specific
        rules can be added later without changing call sites.
        """
        import re
        info = {}
        # Dollar amounts like "$49.99" or "$1,200"
        amounts = re.findall(r'\$[\d,]+\.?\d*', text)
        if amounts:
            info["amounts"] = amounts
        # ISO (2025-01-15) or US-style (1/15/25) dates
        dates = re.findall(r'\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{2,4}', text)
        if dates:
            info["dates"] = dates
        return info

    def _llm_extract(self, text: str) -> dict:
        """LLM extraction for complex cases (simulated)."""
        return {
            "urgency": "high",
            "sentiment": -0.7,
            "key_issues": ["billing dispute", "service cancellation"],
            "requires_human": True,
        }
# Process sample tickets: two simple (rules path) and one complex (LLM path).
pipeline = CustomerSupportPipeline()
sample_tickets = [
    ("T-001", "Please refund $49.99 charged on 2025-01-15"),
    ("T-002", "My package tracking number XY123 shows no updates"),
    ("T-003", "Your AI assistant gave me wrong medical advice and I "
              "want to speak to a manager about this serious issue"),
]

print("Customer Support Pipeline Results:")
print("=" * 65)
for ticket_id, body in sample_tickets:
    outcome = pipeline.process_ticket(ticket_id, body)
    for line in (
        f" Ticket: {outcome.ticket_id}",
        f" Category: {outcome.category}",
        f" Routing: {outcome.routing_tier}",
        f" Extracted: {outcome.extracted_info}",
        f" Action: {outcome.action}",
        f" Cost: ${outcome.total_cost:.5f}",
    ):
        print(line)
    print()
Knowledge Check
Show Answer
Show Answer
Show Answer
Show Answer
Show Answer
Key Takeaways
- The triage pattern (classifier handles 80% of requests, LLM handles 20%) is the most common and effective hybrid architecture for cost-sensitive production systems.
- Ensemble voting improves robustness for high-stakes tasks by combining predictions from multiple models, at the cost of running all models on every request.
- Cascading architectures (regex, then small model, then large LLM) offer fine-grained cost control with multiple escalation tiers.
- LLM-based routers are powerful but expensive; consider lightweight classifiers or heuristics for the routing decision to avoid the router cost trap.
- Production pipelines typically combine patterns: triage for routing, rules for simple extraction, LLM for complex analysis, and a rules engine for action execution.