Kandinsky 2.2

Kandinsky inherits best practices from DALL-E 2 and latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and a diffusion image prior that maps between the latent spaces of the CLIP modalities. This approach improves the model's visual quality and opens new possibilities for blending images and for text-guided image manipulation. The Kandinsky model was created by Arseniy Shakhmatov, Anton Razzhigaev, Aleksandr Nikolich, Igor Pavlov, Andrey Kuznetsov, and Denis Dimitrov.
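
A minimal sketch of that two-stage design, assuming a CUDA GPU (the prompt and output handling are illustrative; the examples below use AutoPipeline wrappers that hide this split):

from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline
import torch

# Stage 1: the prior maps the CLIP text embedding of the prompt to a CLIP image embedding
prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")
image_embeds, negative_image_embeds = prior("a red fox in the snow").to_tuple()

# Stage 2: the decoder turns the predicted image embedding into pixels
decoder = KandinskyV22Pipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
).to("cuda")
image = decoder(
    image_embeds=image_embeds,
    negative_image_embeds=negative_image_embeds,
    height=768,
    width=768,
).images[0]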

How to use

Text-to-image

from diffusers import AutoPipelineForText2Image
import torch

pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

prompt = "portrait of a young woman, blue eyes, cinematic"
negative_prompt = "low quality, bad quality"

image = pipe(prompt=prompt, negative_prompt=negative_prompt, prior_guidance_scale=1.0, height=768, width=768).images[0]
image.save("portrait.png")

Text-guided image-to-image generation

from PIL import Image
import requests
from io import BytesIO

url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
response = requests.get(url)
original_image = Image.open(BytesIO(response.content)).convert("RGB")
original_image = original_image.resize((768, 512))

from diffusers import AutoPipelineForImage2Image
import torch

pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()

prompt = "A fantasy landscape, Cinematic lighting"
negative_prompt = "low quality, bad quality"

image = pipe(prompt=prompt, negative_prompt=negative_prompt, image=original_image, strength=0.3, height=768, width=768).images[0]
image.save("fantasy_land.png")

Interpolation

from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline
from diffusers.utils import load_image
import torch

pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")

img1 = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
)
img2 = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/starry_night.jpeg"
)

images_texts = ["a cat", img1, img2]
weights = [0.3, 0.3, 0.4]

# The prior interpolates the CLIP embeddings of the three conditions according to the weights
prior_out = pipe_prior.interpolate(images_texts, weights)

pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
pipe.to("cuda")

image = pipe(**prior_out, height=768, width=768).images[0]
image.save("starry_cat.png")

Text-guided inpainting generation

from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image
import torch
import numpy as np

pipe = AutoPipelineForInpainting.from_pretrained("kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()

prompt = "a hat"

init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
)

# 1 marks the strip above the cat's head as the region to repaint; 0 keeps the original pixels
mask = np.zeros((768, 768), dtype=np.float32)
mask[:250, 250:-250] = 1

out = pipe(
    prompt=prompt,
    image=init_image,
    mask_image=mask,
    height=768,
    width=768,
    num_inference_steps=150,
)

image = out.images[0]
image.save("cat_with_hat.png")

Text-to-image generation with ControlNet conditioning

import torch
import numpy as np
from transformers import pipeline
from diffusers.utils import load_image
from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline

def make_hint(image, depth_estimator):
    # Run monocular depth estimation on the image
    image = depth_estimator(image)["depth"]
    image = np.array(image)
    # Replicate the single-channel depth map into 3 channels
    image = image[:, :, None]
    image = np.concatenate([image, image, image], axis=2)
    # Scale to [0, 1] and reorder to (C, H, W) as the ControlNet hint
    detected_map = torch.from_numpy(image).float() / 255.0
    hint = detected_map.permute(2, 0, 1)
    return hint

img = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png"
).resize((768, 768))

# Use the 'depth-estimation' pipeline from transformers to process the image and extract its depth map
depth_estimator = pipeline("depth-estimation")
hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda")

pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")

pipe = KandinskyV22ControlnetPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
)
pipe.to("cuda")

prompt = "A robot, 4k photo"
negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"

generator = torch.Generator(device="cuda").manual_seed(43)
image_emb, zero_image_emb = pipe_prior(
    prompt=prompt, negative_prompt=negative_prior_prompt, generator=generator
).to_tuple()

images = pipe(
    image_embeds=image_emb,
    negative_image_embeds=zero_image_emb,
    hint=hint,
    num_inference_steps=50,
    generator=generator,
    height=768,
    width=768,
).images
images[0].save("robot_cat.png")
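
If GPU memory is tight, both pipelines can offload idle submodules to the CPU instead of being moved to CUDA outright (this requires the accelerate package); a sketch:

# Alternative to pipe_prior.to("cuda") and pipe.to("cuda"): offload submodules when idle
pipe_prior.enable_model_cpu_offload()
pipe.enable_model_cpu_offload()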

Image-to-image generation with ControlNet conditioning

import torch
import numpy as np
from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline
from diffusers.utils import load_image
from transformers import pipeline

img = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png"
).resize((768, 768))

def make_hint(image, depth_estimator):
    # Same depth-hint helper as in the text-to-image ControlNet example
    image = depth_estimator(image)["depth"]
    image = np.array(image)
    image = image[:, :, None]
    image = np.concatenate([image, image, image], axis=2)
    detected_map = torch.from_numpy(image).float() / 255.0
    hint = detected_map.permute(2, 0, 1)
    return hint

depth_estimator = pipeline("depth-estimation")
hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda")

pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")

pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
)
pipe.to("cuda")

prompt = "A robot, 4k photo"
negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"

generator = torch.Generator(device="cuda").manual_seed(43)

# In the Emb2Emb prior, strength controls how far the reference image embedding is moved toward the prompt
img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator)
negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator)

images = pipe(
    image=img,
    strength=0.5,  # decoder-side img2img strength: how much of the input image is re-noised
    image_embeds=img_emb.image_embeds,
    negative_image_embeds=negative_emb.image_embeds,
    hint=hint,
    num_inference_steps=50,
    generator=generator,
    height=768,
    width=768,
).images
images[0].save("robot_cat.png")

Features

Text-conditioned diffusion model
Uses CLIP as a text and image encoder
Introduces a diffusion image prior between the CLIP latent spaces
Compatible with diffusers
Architecture comprising a transformer-based prior model, a UNet diffusion model, and a decoder (see the sketch below)
Trained on high-resolution data with diverse aspect ratios
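
A sketch that loads the two pipelines and inspects those components; the class names in the comments are what recent diffusers releases ship and may vary by version:

from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline

prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior")
decoder = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder")

print(type(prior.prior).__name__)   # transformer-based image prior (PriorTransformer)
print(type(decoder.unet).__name__)  # diffusion UNet (UNet2DConditionModel)
print(type(decoder.movq).__name__)  # MoVQ image decoder (VQModel)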

Use cases

Text-to-image generation
Image interpolation
Text-guided inpainting
Image generation with ControlNet conditioning