Complete Guide to Stable Diffusion with ControlNet
Introduction
ControlNet is a neural network architecture that allows you to control Stable Diffusion image generation with additional input conditions like edge maps, depth maps, poses, and more. It provides precise control over the composition, structure, and layout of generated images while maintaining the creative power of diffusion models.
Key Benefits
- Precise Control: Direct influence over image structure and composition
- Consistency: Maintain specific poses, edges, or layouts across generations
- Flexibility: Multiple conditioning types for different use cases
- Quality: Enhanced output quality with structured guidance
Installation & Setup
Environment Setup
# Create conda environment
conda create -n controlnet python=3.10
conda activate controlnet
# Install PyTorch with CUDA support
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Install core dependencies
pip install diffusers transformers accelerate
pip install controlnet-aux
pip install opencv-python
pip install xformers # Optional but recommended for performance
Required Libraries
import torch
import numpy as np
import cv2
from PIL import Image
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    UniPCMultistepScheduler
)
from controlnet_aux import (
    CannyDetector,
    OpenposeDetector,
    MidasDetector,
    HEDdetector,
    MLSDdetector,
    LineartDetector,
    LineartAnimeDetector
)
from transformers import pipeline
Basic Setup Function
def setup_controlnet_pipeline(controlnet_type="canny", model_id="runwayml/stable-diffusion-v1-5"):
    """
    Setup a ControlNet pipeline with the specified type and base model.

    Args:
        controlnet_type: Type of ControlNet ('canny', 'openpose', 'depth', etc.)
        model_id: Base Stable Diffusion model to use

    Returns:
        Configured pipeline
    """
    # ControlNet model mapping
    controlnet_models = {
        "canny": "lllyasviel/sd-controlnet-canny",
        "openpose": "lllyasviel/sd-controlnet-openpose",
        "depth": "lllyasviel/sd-controlnet-depth",
        "hed": "lllyasviel/sd-controlnet-hed",
        "mlsd": "lllyasviel/sd-controlnet-mlsd",
        "normal": "lllyasviel/sd-controlnet-normal",
        "scribble": "lllyasviel/sd-controlnet-scribble",
        "seg": "lllyasviel/sd-controlnet-seg"
    }

    # Load ControlNet
    controlnet = ControlNetModel.from_pretrained(
        controlnet_models[controlnet_type],
        torch_dtype=torch.float16
    )

    # Create pipeline
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        model_id,
        controlnet=controlnet,
        torch_dtype=torch.float16,
        safety_checker=None,
        requires_safety_checker=False
    )

    # Optimize for GPU
    pipe = pipe.to("cuda")
    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

    # Enable CPU offloading and memory-efficient attention
    pipe.enable_model_cpu_offload()
    pipe.enable_xformers_memory_efficient_attention()

    return pipe
Understanding ControlNet
Core Concept
ControlNet works by attaching additional, trainable layers to Stable Diffusion that process conditioning inputs (such as edge maps or poses) and inject the resulting control signals into the UNet at multiple resolution levels. The original Stable Diffusion weights remain frozen; only the ControlNet layers learn to translate the conditioning input into meaningful guidance, which preserves the base model's creative power while adding precise structural control.
Architecture Overview
class ControlNetArchitecture:
    """
    Conceptual overview of ControlNet architecture
    """
    def __init__(self):
        self.encoder_layers = []      # Process conditioning input
        self.zero_convolutions = []   # Ensure training stability
        self.connection_layers = []   # Connect to UNet blocks

    def forward(self, x_noisy, timestep, conditioning_input):
        # Process conditioning input through encoder
        control_features = self.process_conditioning(conditioning_input)

        # Apply zero convolutions for stable training
        control_features = self.apply_zero_convs(control_features)

        # Inject into UNet at multiple resolution levels
        return self.inject_control(x_noisy, timestep, control_features)
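The zero convolutions referenced above are ordinary convolution layers initialized to all zeros, so at the start of training the control branch contributes nothing and cannot destabilize the frozen base model. A minimal PyTorch sketch of the idea (illustrative only, not part of the diffusers API):
import torch.nn as nn

def make_zero_conv(channels):
    # 1x1 convolution whose weights and bias start at zero:
    # its output is zero initially, so control features are blended in
    # gradually as training moves the weights away from zero.
    conv = nn.Conv2d(channels, channels, kernel_size=1)
    nn.init.zeros_(conv.weight)
    nn.init.zeros_(conv.bias)
    return conv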
Basic Implementation
Canny Edge Control
Canny edge detection provides structural control based on edges in the input image.
def generate_with_canny(pipe, image_path, prompt, negative_prompt="", num_inference_steps=20):
    """
    Generate an image using Canny edge control.
    """
    # Load and preprocess image
    original_image = Image.open(image_path)
    original_image = original_image.resize((512, 512))

    # Create Canny detector
    canny_detector = CannyDetector()

    # Generate Canny edge map
    canny_image = canny_detector(original_image)

    # Generate image
    result = pipe(
        prompt=prompt,
        image=canny_image,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=7.5,
        controlnet_conditioning_scale=1.0
    )

    return result.images[0], canny_image

# Example usage
pipe = setup_controlnet_pipeline("canny")
prompt = "a beautiful landscape painting, oil painting style, vibrant colors"
generated_image, control_image = generate_with_canny(pipe, "input.jpg", prompt)
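Depending on the installed controlnet_aux version, the Canny detector also accepts explicit thresholds, which can help when the default edge map comes out too sparse or too noisy; treat the keyword names below as version-dependent rather than guaranteed:
# Optional (version-dependent): tune Canny thresholds for denser or sparser edge maps
original_image = Image.open("input.jpg").resize((512, 512))
canny_detector = CannyDetector()
sparse_edges = canny_detector(original_image, low_threshold=150, high_threshold=250)  # fewer edges
dense_edges = canny_detector(original_image, low_threshold=50, high_threshold=150)    # more edges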
OpenPose Human Pose Control
OpenPose allows control over human poses and body positions.
def generate_with_openpose(pipe, image_path, prompt, negative_prompt=""):
    """
    Generate an image using OpenPose control.
    """
    # Load image
    original_image = Image.open(image_path)
    original_image = original_image.resize((512, 512))

    # Create OpenPose detector
    openpose_detector = OpenposeDetector.from_pretrained('lllyasviel/Annotators')

    # Generate pose keypoints
    pose_image = openpose_detector(original_image)

    # Generate image with pose control
    result = pipe(
        prompt=prompt,
        image=pose_image,
        negative_prompt=negative_prompt,
        num_inference_steps=20,
        guidance_scale=7.5,
        controlnet_conditioning_scale=1.0
    )

    return result.images[0], pose_image

# Example usage
pipe = setup_controlnet_pipeline("openpose")
prompt = "a robot dancing, futuristic style, neon lighting"
generated_image, pose_image = generate_with_openpose(pipe, "person_dancing.jpg", prompt)
Depth Map Control
Depth maps provide 3D structure control for more realistic spatial relationships.
def generate_with_depth(pipe, image_path, prompt, negative_prompt=""):
    """
    Generate an image using depth map control.
    """
    # Load image
    original_image = Image.open(image_path)
    original_image = original_image.resize((512, 512))

    # Create depth estimator
    depth_estimator = pipeline('depth-estimation')

    # Generate depth map
    depth = depth_estimator(original_image)['depth']
    depth_image = Image.fromarray(np.array(depth)).convert('RGB')

    # Generate image with depth control
    result = pipe(
        prompt=prompt,
        image=depth_image,
        negative_prompt=negative_prompt,
        num_inference_steps=20,
        guidance_scale=7.5,
        controlnet_conditioning_scale=1.0
    )

    return result.images[0], depth_image
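For symmetry with the Canny and OpenPose examples, usage might look like the following sketch; the file name and prompt are placeholders:
# Example usage (sketch)
pipe = setup_controlnet_pipeline("depth")
prompt = "a minimalist living room, soft natural light, architectural photography"
generated_image, depth_image = generate_with_depth(pipe, "room.jpg", prompt)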
Advanced ControlNet Types
Line Art Control
Perfect for anime-style generation and clean line art conversion.
def setup_lineart_pipeline():
    """
    Setup a pipeline for line art control.
    """
    # ControlNet 1.1 line art checkpoint (the original sd-controlnet v1.0 series
    # does not include a line art model)
    controlnet = ControlNetModel.from_pretrained(
        "lllyasviel/control_v11p_sd15_lineart",
        torch_dtype=torch.float16
    )

    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        controlnet=controlnet,
        torch_dtype=torch.float16
    ).to("cuda")

    return pipe

def generate_with_lineart(pipe, image_path, prompt, anime_style=False):
    """
    Generate using line art control.
    """
    original_image = Image.open(image_path).resize((512, 512))

    # Choose detector based on style
    if anime_style:
        detector = LineartAnimeDetector.from_pretrained('lllyasviel/Annotators')
    else:
        detector = LineartDetector.from_pretrained('lllyasviel/Annotators')

    lineart_image = detector(original_image)

    result = pipe(
        prompt=prompt,
        image=lineart_image,
        num_inference_steps=20,
        guidance_scale=7.5,
        controlnet_conditioning_scale=1.0
    )

    return result.images[0], lineart_image
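A possible usage sketch (the input path and prompt are placeholders):
# Example usage (sketch)
pipe = setup_lineart_pipeline()
prompt = "an anime character portrait, clean colors, studio lighting"
generated_image, lineart_image = generate_with_lineart(pipe, "character_sketch.png", prompt, anime_style=True)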
Scribble Control
Allows rough sketches to guide generation.
def create_scribble_from_sketch(sketch_path):
    """
    Process a rough sketch for scribble control.
    """
    sketch = cv2.imread(sketch_path, 0)

    # Apply threshold to create a clean binary image
    _, binary = cv2.threshold(sketch, 127, 255, cv2.THRESH_BINARY)

    # Convert to 3-channel RGB
    scribble = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)

    return Image.fromarray(scribble)

def generate_with_scribble(pipe, scribble_image, prompt):
    """
    Generate from scribble input.
    """
    result = pipe(
        prompt=prompt,
        image=scribble_image,
        num_inference_steps=20,
        guidance_scale=7.5,
        controlnet_conditioning_scale=1.0
    )

    return result.images[0]
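Putting the two helpers together with the scribble ControlNet from setup_controlnet_pipeline might look like this sketch; sketch.png is a placeholder path:
# Example usage (sketch)
pipe = setup_controlnet_pipeline("scribble")
scribble_image = create_scribble_from_sketch("sketch.png")
result_image = generate_with_scribble(pipe, scribble_image, "a cottage by a lake, watercolor style")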
Normal Map Control
Provides detailed surface normal information for realistic lighting.
def generate_normal_map(image_path):
    """
    Generate a normal map from an image.
    """
    # Load depth estimator
    depth_estimator = MidasDetector.from_pretrained('lllyasviel/Annotators')

    image = Image.open(image_path).resize((512, 512))

    # Generate depth map
    depth_map = depth_estimator(image)

    # Convert depth to normal map (simplified); work on a single channel
    depth_array = np.array(depth_map.convert('L'), dtype=np.float64)

    # Calculate gradients
    grad_x = cv2.Sobel(depth_array, cv2.CV_64F, 1, 0, ksize=3)
    grad_y = cv2.Sobel(depth_array, cv2.CV_64F, 0, 1, ksize=3)

    # Create normal vectors
    normal_x = -grad_x / 255.0
    normal_y = -grad_y / 255.0
    normal_z = np.ones_like(normal_x)

    # Normalize
    length = np.sqrt(normal_x**2 + normal_y**2 + normal_z**2)
    normal_x /= length
    normal_y /= length
    normal_z /= length

    # Convert to 0-255 range
    normal_map = np.stack([
        ((normal_x + 1) * 127.5).astype(np.uint8),
        ((normal_y + 1) * 127.5).astype(np.uint8),
        ((normal_z + 1) * 127.5).astype(np.uint8)
    ], axis=-1)

    return Image.fromarray(normal_map)
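The resulting normal map can then drive the normal-map ControlNet configured earlier; this sketch assumes the "normal" entry from setup_controlnet_pipeline and a placeholder input path:
# Example usage (sketch)
pipe = setup_controlnet_pipeline("normal")
normal_image = generate_normal_map("statue.jpg")
result = pipe(
    prompt="a marble statue in a museum, dramatic lighting",
    image=normal_image,
    num_inference_steps=20,
    guidance_scale=7.5
)
result.images[0].save("normal_controlled.png")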
Combining Multiple ControlNets
Combining multiple ControlNets allows for more sophisticated control by leveraging different types of conditioning simultaneously, such as pose + depth or edges + normal maps.
Multi-ControlNet Setup
def setup_multi_controlnet_pipeline(controlnet_types):
    """
    Setup a pipeline with multiple ControlNets.
    """
    from diffusers import MultiControlNetModel

    controlnet_models = {
        "canny": "lllyasviel/sd-controlnet-canny",
        "openpose": "lllyasviel/sd-controlnet-openpose",
        "depth": "lllyasviel/sd-controlnet-depth"
    }

    # Load multiple ControlNets
    controlnets = [
        ControlNetModel.from_pretrained(controlnet_models[ctype], torch_dtype=torch.float16)
        for ctype in controlnet_types
    ]

    # Create multi-ControlNet
    multi_controlnet = MultiControlNetModel(controlnets)

    # Create pipeline
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        controlnet=multi_controlnet,
        torch_dtype=torch.float16
    ).to("cuda")

    return pipe

def generate_with_multiple_controls(pipe, image_path, prompt):
    """
    Generate using multiple control inputs.
    """
    original_image = Image.open(image_path).resize((512, 512))

    # Generate different control images
    canny_detector = CannyDetector()
    openpose_detector = OpenposeDetector.from_pretrained('lllyasviel/Annotators')

    canny_image = canny_detector(original_image)
    pose_image = openpose_detector(original_image)

    # Generate with multiple controls
    result = pipe(
        prompt=prompt,
        image=[canny_image, pose_image],
        num_inference_steps=20,
        guidance_scale=7.5,
        controlnet_conditioning_scale=[1.0, 0.8]  # Different weights for each control
    )

    return result.images[0]

# Example usage
pipe = setup_multi_controlnet_pipeline(["canny", "openpose"])
result = generate_with_multiple_controls(pipe, "input.jpg", "a cyberpunk warrior")
Fine-tuning Parameters
Control Strength and Guidance
def advanced_generation_control(pipe, control_image, prompt, **kwargs):
    """
    Advanced parameter control for fine-tuning generation.
    """
    # Default parameters
    params = {
        'prompt': prompt,
        'image': control_image,
        'num_inference_steps': 20,
        'guidance_scale': 7.5,
        'controlnet_conditioning_scale': 1.0,
        'control_guidance_start': 0.0,
        'control_guidance_end': 1.0,
        'eta': 0.0,
        'generator': torch.manual_seed(42)
    }

    # Update with custom parameters
    params.update(kwargs)

    # Generate image
    result = pipe(**params)

    return result.images[0]

# Examples of parameter variations
variations = [
    # Strong control throughout
    {'controlnet_conditioning_scale': 1.5},
    # Weak control for more creativity
    {'controlnet_conditioning_scale': 0.5},
    # Control only in early steps
    {'control_guidance_end': 0.5},
    # Control only in later steps
    {'control_guidance_start': 0.5},
    # Higher guidance for more prompt adherence
    {'guidance_scale': 12.0},
    # More inference steps for quality
    {'num_inference_steps': 50}
]
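One way to use the variations list is a simple sweep that renders the same control image once per parameter set; pipe and control_image are assumed to come from the earlier setup code, and the prompt and output file names are arbitrary:
# Parameter sweep (sketch): pipe and control_image come from the setup code above
for i, overrides in enumerate(variations):
    image = advanced_generation_control(pipe, control_image, "a cozy cabin in the woods", **overrides)
    image.save(f"variation_{i}.png")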
Adaptive Control Strength
def adaptive_control_strength(pipe, control_image, prompt, complexity_factor=1.0):
    """
    Automatically adjust control strength based on image complexity.
    """
    # Analyze control image complexity
    control_array = np.array(control_image.convert('L'))

    # Calculate edge density as a complexity measure
    edges = cv2.Canny(control_array, 50, 150)
    edge_density = np.sum(edges > 0) / edges.size

    # Adjust control strength based on complexity
    base_strength = 1.0
    if edge_density > 0.1:    # High detail
        control_strength = base_strength * 0.8 * complexity_factor
    elif edge_density < 0.05: # Low detail
        control_strength = base_strength * 1.2 * complexity_factor
    else:                     # Medium detail
        control_strength = base_strength * complexity_factor

    result = pipe(
        prompt=prompt,
        image=control_image,
        controlnet_conditioning_scale=control_strength,
        num_inference_steps=20,
        guidance_scale=7.5
    )

    return result.images[0], control_strength
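A brief usage sketch, reusing a control image produced by any of the detectors above:
# Example usage (sketch): pipe and control_image come from earlier code
image, used_strength = adaptive_control_strength(pipe, control_image, "a futuristic city street at night")
print(f"Applied controlnet_conditioning_scale: {used_strength:.2f}")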
Production Optimization
Memory Management
class OptimizedControlNetGenerator:
    """
    Production-ready ControlNet generator with optimization.
    """
    def __init__(self, controlnet_type="canny", enable_cpu_offload=True):
        self.pipe = setup_controlnet_pipeline(controlnet_type)

        if enable_cpu_offload:
            self.pipe.enable_model_cpu_offload()

        # Enable memory-efficient attention
        self.pipe.enable_xformers_memory_efficient_attention()

        # Compile model for faster inference (PyTorch 2.0+)
        try:
            self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True)
        except Exception:
            print("torch.compile not available, skipping optimization")

    def generate_batch(self, control_images, prompts, batch_size=4):
        """
        Generate multiple images in batches for efficiency.
        """
        results = []

        for i in range(0, len(prompts), batch_size):
            batch_prompts = prompts[i:i + batch_size]
            batch_images = control_images[i:i + batch_size]

            # Clear cache before each batch
            torch.cuda.empty_cache()

            batch_results = self.pipe(
                prompt=batch_prompts,
                image=batch_images,
                num_inference_steps=20,
                guidance_scale=7.5
            )

            results.extend(batch_results.images)

        return results

    def generate_with_callback(self, control_image, prompt, callback=None):
        """
        Generate with a progress callback.
        """
        def progress_callback(step, timestep, latents):
            if callback:
                callback(step, timestep)

        result = self.pipe(
            prompt=prompt,
            image=control_image,
            callback=progress_callback,
            callback_steps=1
        )

        return result.images[0]
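A usage sketch for the generator class; the file names, prompts, and the print-based progress callback are illustrative:
# Example usage (sketch)
generator = OptimizedControlNetGenerator("canny")

canny = CannyDetector()
control_images = [canny(Image.open(p).resize((512, 512))) for p in ["a.jpg", "b.jpg"]]
prompts = ["a castle at dawn", "a castle at night"]
images = generator.generate_batch(control_images, prompts, batch_size=2)

def report_progress(step, timestep):
    print(f"step {step} (timestep {timestep})")

single = generator.generate_with_callback(control_images[0], prompts[0], callback=report_progress)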
Caching and Preprocessing
import os
import hashlib

class ControlNetCache:
    """
    Cache system for preprocessed control images.
    """
    def __init__(self, cache_dir="./controlnet_cache"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        self.detectors = {}

    def get_detector(self, detector_type):
        """
        Lazily load and cache detectors (only the requested detector is instantiated).
        """
        if detector_type not in self.detectors:
            detector_factories = {
                'canny': CannyDetector,
                'openpose': lambda: OpenposeDetector.from_pretrained('lllyasviel/Annotators'),
                'hed': lambda: HEDdetector.from_pretrained('lllyasviel/Annotators'),
                'mlsd': lambda: MLSDdetector.from_pretrained('lllyasviel/Annotators')
            }
            self.detectors[detector_type] = detector_factories[detector_type]()

        return self.detectors[detector_type]

    def get_control_image(self, image_path, control_type, force_refresh=False):
        """
        Get a control image, using the disk cache when possible.
        """
        # Create cache key from the image contents
        with open(image_path, 'rb') as f:
            image_hash = hashlib.md5(f.read()).hexdigest()
        cache_path = os.path.join(self.cache_dir, f"{image_hash}_{control_type}.png")

        # Check cache
        if os.path.exists(cache_path) and not force_refresh:
            return Image.open(cache_path)

        # Generate control image
        original_image = Image.open(image_path).resize((512, 512))
        detector = self.get_detector(control_type)
        control_image = detector(original_image)

        # Save to cache
        control_image.save(cache_path)

        return control_image
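With the cache in place, repeated requests for the same image and control type are served from disk; photo.jpg is a placeholder path:
# Example usage (sketch)
cache = ControlNetCache()
canny_image = cache.get_control_image("photo.jpg", "canny")        # computed and written to cache
canny_image_again = cache.get_control_image("photo.jpg", "canny")  # loaded from the disk cache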
Troubleshooting
- GPU Memory: ControlNet models require significant GPU memory (8GB+ recommended)
- Image Format: Ensure control images are in RGB format and proper dimensions
- Model Compatibility: Match ControlNet models with compatible Stable Diffusion versions
Common Issues and Solutions
def diagnose_controlnet_issues(pipe, control_image, prompt):
    """
    Diagnostic function for common ControlNet issues.
    """
    issues = []

    # Check control image format
    if control_image.mode != 'RGB':
        issues.append("Control image should be RGB format")
        control_image = control_image.convert('RGB')

    # Check image size
    if control_image.size != (512, 512):
        issues.append(f"Control image size {control_image.size} != (512, 512)")
        control_image = control_image.resize((512, 512))

    # Check GPU memory
    if torch.cuda.is_available():
        memory_allocated = torch.cuda.memory_allocated() / 1e9
        memory_reserved = torch.cuda.memory_reserved() / 1e9

        if memory_reserved > 10:  # More than 10 GB
            issues.append(f"High GPU memory usage: {memory_reserved:.1f}GB")

    # Check prompt length
    if len(prompt.split()) > 75:
        issues.append("Very long prompt may be truncated (CLIP's limit is 77 tokens)")

    if issues:
        print("Detected issues:")
        for issue in issues:
            print(f"- {issue}")

    return control_image

def memory_cleanup():
    """
    Clean up GPU memory.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

# Error handling wrapper
def safe_generate(pipe, control_image, prompt, max_retries=3):
    """
    Generate with error handling and retries.
    """
    for attempt in range(max_retries):
        try:
            # Diagnose and fix common issues first
            control_image = diagnose_controlnet_issues(pipe, control_image, prompt)

            # Generate
            result = pipe(
                prompt=prompt,
                image=control_image,
                num_inference_steps=20,
                guidance_scale=7.5
            )

            return result.images[0]

        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                print(f"GPU OOM on attempt {attempt + 1}, cleaning memory...")
                memory_cleanup()
                if attempt == max_retries - 1:
                    raise
            else:
                raise
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {e}")
            if attempt == max_retries - 1:
                raise

    return None
Performance Benchmarking
import time
from contextlib import contextmanager

@contextmanager
def timer():
    """
    Simple timing context manager.
    """
    start = time.time()
    yield
    end = time.time()
    print(f"Execution time: {end - start:.2f} seconds")

def benchmark_controlnet(pipe, control_image, prompt, runs=5):
    """
    Benchmark ControlNet performance.
    """
    times = []

    # Warmup run
    _ = pipe(prompt=prompt, image=control_image, num_inference_steps=5)

    # Benchmark runs
    for i in range(runs):
        start_time = time.time()
        result = pipe(
            prompt=prompt,
            image=control_image,
            num_inference_steps=20,
            guidance_scale=7.5
        )
        end_time = time.time()
        times.append(end_time - start_time)

    avg_time = sum(times) / len(times)
    print(f"Average generation time: {avg_time:.2f} seconds")
    print(f"Images per minute: {60 / avg_time:.1f}")

    return result.images[0]
Best Practices Summary
- Memory Management: Use CPU offloading and memory efficient attention for large models
- Preprocessing: Cache control images when generating multiple variations
- Parameter Tuning: Adjust controlnet_conditioning_scale based on the desired control strength
- Quality vs Speed: Balance num_inference_steps against generation time requirements
- Multi-Control: Use different conditioning scales when combining multiple ControlNets
- Error Handling: Implement robust error handling for production systems
- Optimization: Use torch.compile() and xformers for performance improvements
Parameter Reference Table
Parameter | Range | Effect
---|---|---
controlnet_conditioning_scale | 0.5-1.5 | Control strength
guidance_scale | 5.0-15.0 | Prompt adherence
num_inference_steps | 10-50 | Quality vs speed
control_guidance_start | 0.0-0.5 | When control starts
control_guidance_end | 0.5-1.0 | When control ends
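If a concrete starting point is helpful, these ranges can be bundled into a few presets; the names and exact values below are illustrative choices within the table's ranges, not canonical settings:
# Illustrative presets drawn from the ranges above (names and values are arbitrary)
PARAMETER_PRESETS = {
    "faithful": {"controlnet_conditioning_scale": 1.3, "guidance_scale": 7.5, "num_inference_steps": 30},
    "balanced": {"controlnet_conditioning_scale": 1.0, "guidance_scale": 7.5, "num_inference_steps": 20},
    "creative": {"controlnet_conditioning_scale": 0.6, "guidance_scale": 10.0, "num_inference_steps": 30},
}
# e.g. advanced_generation_control(pipe, control_image, prompt, **PARAMETER_PRESETS["balanced"])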
This comprehensive guide provides everything needed to implement Stable Diffusion with ControlNet, from basic usage to production-ready systems. The modular structure allows for easy customization and extension based on specific requirements.