# controlnet_pythoncodetutorial.py
# %%
!pip install -qU xformers diffusers transformers accelerate
# %%
!pip install -qU controlnet_aux
!pip install opencv-contrib-python
# %% [markdown]
# # Open Pose
# %%
from PIL import Image
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
import torch
from controlnet_aux import OpenposeDetector
from diffusers.utils import load_image
from tqdm import tqdm
from torch import autocast
# %%
# load the openpose model
openpose = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
# load the controlnet for openpose
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16
)
# define stable diffusion pipeline with controlnet
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# %%
# Remove if you do not have xformers installed
# see https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/xformers#installing-xformers
# for installation instructions
pipe.enable_xformers_memory_efficient_attention()
pipe.enable_model_cpu_offload()
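# %% [markdown]
# If xformers is not available, attention slicing is a lighter fallback: it
# trades a little speed for lower memory use. A minimal sketch that enables it
# only when the xformers import fails:
# %%
try:
    import xformers  # noqa: F401
except ImportError:
    pipe.enable_attention_slicing()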
# %%
image_input = load_image("https://cdn.pixabay.com/photo/2016/05/17/22/19/fashion-1399344_640.jpg")
image_input
# %%
image_pose = openpose(image_input)
image_pose
# %%
image_output = pipe("A professional photograph of a male fashion model", image_pose, num_inference_steps=20).images[0]
image_output
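# %% [markdown]
# The pipeline also accepts a `generator` argument, so runs can be made
# reproducible with a seeded `torch.Generator` (a minimal sketch; the seed
# value here is arbitrary):
# %%
generator = torch.Generator(device='cpu').manual_seed(42)
image_output = pipe("A professional photograph of a male fashion model",
                    image_pose,
                    num_inference_steps=20,
                    generator=generator).images[0]
image_output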
# %% [markdown]
# # Custom implementation
# %%
class ControlNetDiffusionPipelineCustom:
    """custom implementation of the ControlNet Diffusion Pipeline"""

    def __init__(self,
                 vae,
                 tokenizer,
                 text_encoder,
                 unet,
                 controlnet,
                 scheduler,
                 image_processor,
                 control_image_processor):
        self.vae = vae
        self.tokenizer = tokenizer
        self.text_encoder = text_encoder
        self.unet = unet
        self.scheduler = scheduler
        self.controlnet = controlnet
        self.image_processor = image_processor
        self.control_image_processor = control_image_processor
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def get_text_embeds(self, text):
        """returns embeddings for the given `text`"""
        # tokenize the text
        text_input = self.tokenizer(text,
                                    padding='max_length',
                                    max_length=self.tokenizer.model_max_length,
                                    truncation=True,
                                    return_tensors='pt')
        # embed the text
        with torch.no_grad():
            text_embeds = self.text_encoder(text_input.input_ids.to(self.device))[0]
        return text_embeds

    def get_prompt_embeds(self, prompt):
        """returns prompt embeddings based on classifier-free guidance"""
        if isinstance(prompt, str):
            prompt = [prompt]
        # get conditional prompt embeddings
        cond_embeds = self.get_text_embeds(prompt)
        # get unconditional prompt embeddings
        uncond_embeds = self.get_text_embeds([''] * len(prompt))
        # concatenate the above 2 embeds
        prompt_embeds = torch.cat([uncond_embeds, cond_embeds])
        return prompt_embeds

    def transform_image(self, image):
        """convert image from pytorch tensor to PIL format"""
        image = self.image_processor.postprocess(image, output_type='pil')
        return image

    def get_initial_latents(self, height, width, num_channels_latents, batch_size):
        """returns noise latent tensor of relevant shape scaled by the scheduler"""
        image_latents = torch.randn((batch_size,
                                     num_channels_latents,
                                     height // 8,
                                     width // 8)).to(self.device)
        # scale the initial noise by the standard deviation required by the scheduler
        image_latents = image_latents * self.scheduler.init_noise_sigma
        return image_latents

    def denoise_latents(self,
                        prompt_embeds,
                        controlnet_image,
                        timesteps,
                        latents,
                        guidance_scale=7.5):
        """denoises latents from noisy latent to a meaningful latent as conditioned by controlnet"""
        # use autocast for automatic mixed precision (AMP) inference
        with autocast('cuda'):
            for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
                # duplicate image latents to do classifier-free guidance
                latent_model_input = torch.cat([latents] * 2)
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                # the controlnet sees the same duplicated, scaled latents
                # and prompt embeddings as the unet
                control_model_input = latent_model_input
                controlnet_prompt_embeds = prompt_embeds
                with torch.no_grad():
                    # get output from the controlnet blocks
                    down_block_res_samples, mid_block_res_sample = self.controlnet(
                        control_model_input,
                        t,
                        encoder_hidden_states=controlnet_prompt_embeds,
                        controlnet_cond=controlnet_image,
                        conditioning_scale=1.0,
                        return_dict=False,
                    )
                    # predict noise residuals
                    noise_pred = self.unet(
                        latent_model_input,
                        t,
                        encoder_hidden_states=prompt_embeds,
                        down_block_additional_residuals=down_block_res_samples,
                        mid_block_additional_residual=mid_block_res_sample,
                    )['sample']
                # separate predictions for unconditional and conditional outputs
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                # perform guidance
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                # remove the noise from the current sample i.e. go from x_t to x_{t-1}
                latents = self.scheduler.step(noise_pred, t, latents)['prev_sample']
        return latents

    def prepare_controlnet_image(self,
                                 image,
                                 height,
                                 width):
        """preprocesses the controlnet image"""
        # process the image
        image = self.control_image_processor.preprocess(image, height, width).to(dtype=torch.float32)
        # send image to the device
        image = image.to(self.device)
        # repeat the image for classifier-free guidance
        image = torch.cat([image] * 2)
        return image

    def __call__(self,
                 prompt,
                 image,
                 num_inference_steps=20,
                 guidance_scale=7.5,
                 height=512, width=512):
        """generates new image based on the `prompt` and the `image`"""
        # encode input prompt
        prompt_embeds = self.get_prompt_embeds(prompt)
        # prepare image for controlnet
        controlnet_image = self.prepare_controlnet_image(image, height, width)
        height, width = controlnet_image.shape[-2:]
        # prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps)
        timesteps = self.scheduler.timesteps
        # prepare the initial image in the latent space (noise on which we will do reverse diffusion)
        num_channels_latents = self.unet.config.in_channels
        batch_size = prompt_embeds.shape[0] // 2
        latents = self.get_initial_latents(height, width, num_channels_latents, batch_size)
        # denoise latents
        latents = self.denoise_latents(prompt_embeds,
                                       controlnet_image,
                                       timesteps,
                                       latents,
                                       guidance_scale)
        # decode latents to get the image into pixel space
        latents = latents.to(torch.float16)  # match the dtype of the fp16 VAE weights
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        # convert to PIL Image format
        image = image.detach()  # detach to remove any computed gradients
        image = self.transform_image(image)
        return image
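# %% [markdown]
# The guidance step inside `denoise_latents` is classifier-free guidance: with
# guidance scale $s$, the final noise estimate is
# $\epsilon = \epsilon_\text{uncond} + s\,(\epsilon_\text{text} - \epsilon_\text{uncond})$.
# With $s = 1$ this reduces to the conditional prediction alone, while $s > 1$
# pushes the sample further toward the prompt.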
# %%
# We can reuse all the components from the Hugging Face ControlNet pipeline loaded earlier
vae = pipe.vae
tokenizer = pipe.tokenizer
text_encoder = pipe.text_encoder
unet = pipe.unet
controlnet = pipe.controlnet
scheduler = pipe.scheduler
image_processor = pipe.image_processor
control_image_processor = pipe.control_image_processor
# %%
custom_pipe = ControlNetDiffusionPipelineCustom(vae, tokenizer, text_encoder, unet, controlnet, scheduler, image_processor, control_image_processor)
# %%
# sample image 1
images_custom = custom_pipe("a fashion model wearing a beautiful dress", image_pose, num_inference_steps=20)
images_custom[0]
# %%
# sample image 2
images_custom = custom_pipe("A male fashion model posing in a museum", image_pose, num_inference_steps=20)
images_custom[0]
# %%
# sample image with a different prompt
images_custom = custom_pipe("A professional ice skater wearing a dark blue jacket around sunset, realistic, UHD", image_pose, num_inference_steps=20)
images_custom[0]
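# %% [markdown]
# The custom pipeline draws its initial latents with `torch.randn`, so seeding
# PyTorch's global RNG (a small sketch; the seed is arbitrary) makes a run
# repeatable:
# %%
torch.manual_seed(0)
images_custom = custom_pipe("a fashion model wearing a beautiful dress", image_pose, num_inference_steps=20)
images_custom[0]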
# %% [markdown]
# # Canny
# %%
import cv2
from PIL import Image
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
import torch
import numpy as np
from diffusers.utils import load_image
# %%
# load the controlnet model for canny edge detection
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
)
# load the stable diffusion pipeline with controlnet
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# %%
# enable efficient implementations using xformers for faster inference
pipe.enable_xformers_memory_efficient_attention()
pipe.enable_model_cpu_offload()
# %%
image_input = load_image("https://cdn.pixabay.com/photo/2023/06/03/16/05/spotted-laughingtrush-8037974_640.png")
image_input = np.array(image_input)
Image.fromarray(image_input)
# %%
# define thresholds for canny edge detection
low_threshold = 100
high_threshold = 200
# do canny edge detection
image_canny = cv2.Canny(image_input, low_threshold, high_threshold)
# convert to PIL image format
image_canny = image_canny[:, :, None]
image_canny = np.concatenate([image_canny, image_canny, image_canny], axis=2)
image_canny = Image.fromarray(image_canny)
image_canny
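# %% [markdown]
# The two thresholds control how much detail survives: gradients above
# `high_threshold` always become edges, gradients below `low_threshold` never
# do, and pixels in between are kept only if connected to a strong edge. A
# quick sketch to compare settings (the threshold pairs are arbitrary):
# %%
for low, high in [(50, 100), (100, 200), (200, 400)]:
    edges = cv2.Canny(image_input, low, high)
    print(f"low={low}, high={high}: {int((edges > 0).sum())} edge pixels")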
# %%
image_output = pipe("bird", image_canny, num_inference_steps=20).images[0]
image_output
# %%
image_output = pipe("a cute blue bird with colorful aesthetic feathers", image_canny, num_inference_steps=20).images[0]
image_output
# %% [markdown]
# # Depth
# %%
from transformers import pipeline
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from PIL import Image
import numpy as np
import torch
from diffusers.utils import load_image
# %%
# load the depth estimator model
depth_estimator = pipeline('depth-estimation')
# load the controlnet model for depth estimation
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16
)
# load the stable diffusion pipeline with controlnet
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# %%
# enable efficient implementations using xformers for faster inference
pipe.enable_xformers_memory_efficient_attention()
pipe.enable_model_cpu_offload()
# %%
image_input = load_image("https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png")
image_input
# %%
# get depth estimates
image_depth = depth_estimator(image_input)['depth']
# convert to PIL image format
image_depth = np.array(image_depth)
image_depth = image_depth[:, :, None]
image_depth = np.concatenate([image_depth, image_depth, image_depth], axis=2)
image_depth = Image.fromarray(image_depth)
image_depth
# %%
image_output = pipe("Darth Vader giving lecture", image_depth, num_inference_steps=20).images[0]
image_output
# %%
image_output = pipe("A realistic, aesthetic portrait style photograph of Darth Vader giving lecture, 8k, unreal engine", image_depth, num_inference_steps=20).images[0]
image_output
# %% [markdown]
# # Normal
# %%
from PIL import Image
from transformers import pipeline
import numpy as np
import cv2
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
import torch
from diffusers.utils import load_image
# %%
# load the Dense Prediction Transformer (DPT) model for getting normal maps
depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas")
# load the controlnet model for normal maps
controlnet = ControlNetModel.from_pretrained(
    "fusing/stable-diffusion-v1-5-controlnet-normal", torch_dtype=torch.float16
)
# load the stable diffusion pipeline with controlnet
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# %%
# enable efficient implementations using xformers for faster inference
pipe.enable_xformers_memory_efficient_attention()
pipe.enable_model_cpu_offload()
# %%
image_input = load_image("https://cdn.pixabay.com/photo/2023/06/07/13/02/butterfly-8047187_1280.jpg")
image_input
# %%
# do all the preprocessing to get the normal map
image = depth_estimator(image_input)['predicted_depth'][0]
image = image.numpy()
# normalize depth to [0, 1] so we can threshold out the background
image_depth = image.copy()
image_depth -= np.min(image_depth)
image_depth /= np.max(image_depth)
bg_threshold = 0.4
# Sobel filters estimate the depth gradients along x and y;
# zero them out wherever the normalized depth marks background
x = cv2.Sobel(image, cv2.CV_32F, 1, 0, ksize=3)
x[image_depth < bg_threshold] = 0
y = cv2.Sobel(image, cv2.CV_32F, 0, 1, ksize=3)
y[image_depth < bg_threshold] = 0
# a constant z component keeps the normals pointing toward the camera
z = np.ones_like(x) * np.pi * 2.0
# stack into (x, y, z) vectors, normalize to unit length,
# then map from [-1, 1] into the [0, 255] pixel range
image = np.stack([x, y, z], axis=2)
image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5
image = (image * 127.5 + 127.5).clip(0, 255).astype(np.uint8)
image_normal = Image.fromarray(image)
image_normal
image_normal
# %%
image_output = pipe("A colorful butterfly sitting on apples", image_normal, num_inference_steps=20).images[0]
image_output
# %%
image_output = pipe("A beautiful design", image_normal, num_inference_steps=20).images[0]
image_output
# %% [markdown]
# # Segmentation
# %%
from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
from PIL import Image
import numpy as np
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image
# %%
# load the image processor and the model for doing segmentation
image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")
# load the controlnet model for semantic segmentation
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16
)
# load the stable diffusion pipeline with controlnet
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# %%
# enable efficient implementations using xformers for faster inference
pipe.enable_xformers_memory_efficient_attention()
pipe.enable_model_cpu_offload()
# %%
# define the color palette used to colorize the segmentation classes
palette = np.asarray([
[0, 0, 0],
[120, 120, 120],
[180, 120, 120],
[6, 230, 230],
[80, 50, 50],
[4, 200, 3],
[120, 120, 80],
[140, 140, 140],
[204, 5, 255],
[230, 230, 230],
[4, 250, 7],
[224, 5, 255],
[235, 255, 7],
[150, 5, 61],
[120, 120, 70],
[8, 255, 51],
[255, 6, 82],
[143, 255, 140],
[204, 255, 4],
[255, 51, 7],
[204, 70, 3],
[0, 102, 200],
[61, 230, 250],
[255, 6, 51],
[11, 102, 255],
[255, 7, 71],
[255, 9, 224],
[9, 7, 230],
[220, 220, 220],
[255, 9, 92],
[112, 9, 255],
[8, 255, 214],
[7, 255, 224],
[255, 184, 6],
[10, 255, 71],
[255, 41, 10],
[7, 255, 255],
[224, 255, 8],
[102, 8, 255],
[255, 61, 6],
[255, 194, 7],
[255, 122, 8],
[0, 255, 20],
[255, 8, 41],
[255, 5, 153],
[6, 51, 255],
[235, 12, 255],
[160, 150, 20],
[0, 163, 255],
[140, 140, 140],
[250, 10, 15],
[20, 255, 0],
[31, 255, 0],
[255, 31, 0],
[255, 224, 0],
[153, 255, 0],
[0, 0, 255],
[255, 71, 0],
[0, 235, 255],
[0, 173, 255],
[31, 0, 255],
[11, 200, 200],
[255, 82, 0],
[0, 255, 245],
[0, 61, 255],
[0, 255, 112],
[0, 255, 133],
[255, 0, 0],
[255, 163, 0],
[255, 102, 0],
[194, 255, 0],
[0, 143, 255],
[51, 255, 0],
[0, 82, 255],
[0, 255, 41],
[0, 255, 173],
[10, 0, 255],
[173, 255, 0],
[0, 255, 153],
[255, 92, 0],
[255, 0, 255],
[255, 0, 245],
[255, 0, 102],
[255, 173, 0],
[255, 0, 20],
[255, 184, 184],
[0, 31, 255],
[0, 255, 61],
[0, 71, 255],
[255, 0, 204],
[0, 255, 194],
[0, 255, 82],
[0, 10, 255],
[0, 112, 255],
[51, 0, 255],
[0, 194, 255],
[0, 122, 255],
[0, 255, 163],
[255, 153, 0],
[0, 255, 10],
[255, 112, 0],
[143, 255, 0],
[82, 0, 255],
[163, 255, 0],
[255, 235, 0],
[8, 184, 170],
[133, 0, 255],
[0, 255, 92],
[184, 0, 255],
[255, 0, 31],
[0, 184, 255],
[0, 214, 255],
[255, 0, 112],
[92, 255, 0],
[0, 224, 255],
[112, 224, 255],
[70, 184, 160],
[163, 0, 255],
[153, 0, 255],
[71, 255, 0],
[255, 0, 163],
[255, 204, 0],
[255, 0, 143],
[0, 255, 235],
[133, 255, 0],
[255, 0, 235],
[245, 0, 255],
[255, 0, 122],
[255, 245, 0],
[10, 190, 212],
[214, 255, 0],
[0, 204, 255],
[20, 0, 255],
[255, 255, 0],
[0, 153, 255],
[0, 41, 255],
[0, 255, 204],
[41, 0, 255],
[41, 255, 0],
[173, 0, 255],
[0, 245, 255],
[71, 0, 255],
[122, 0, 255],
[0, 255, 184],
[0, 92, 255],
[184, 255, 0],
[0, 133, 255],
[255, 214, 0],
[25, 194, 194],
[102, 255, 0],
[92, 0, 255],
])
# %%
image_input = load_image("https://cdn.pixabay.com/photo/2023/02/24/07/14/crowd-7810353_1280.jpg")
image_input
# %%
# get the pixel values
pixel_values = image_processor(image_input, return_tensors="pt").pixel_values
# do semantic segmentation
with torch.no_grad():
    outputs = image_segmentor(pixel_values)
# post process the semantic segmentation
seg = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image_input.size[::-1]])[0]
# add colors to the different identified classes
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3
for label, color in enumerate(palette):
    color_seg[seg == label, :] = color
# convert into PIL image format
color_seg = color_seg.astype(np.uint8)
image_seg = Image.fromarray(color_seg)
image_seg
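# %% [markdown]
# As a sanity check, we can list which class ids appear in the segmentation map
# (a sketch that assumes the checkpoint's config ships human-readable ADE20K
# class names in `id2label`):
# %%
for label_id in torch.unique(seg).tolist():
    print(label_id, image_segmentor.config.id2label[label_id])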
# %%
image_output = pipe("A crowd of people staring at a glorious painting", image_seg, num_inference_steps=20).images[0]
image_output
# %%
image_output = pipe("Aliens looking at earth from inside their spaceship from a window, not creepy, not scary, not gross, octane render, smooth", image_seg, num_inference_steps=20).images[0]
image_output
# %%