Kandinsky 2.2
kandinsky-community
Texto a imagen
Kandinsky hereda las mejores prácticas de Dall-E 2 y difusión latente mientras introduce algunas nuevas ideas. Utiliza el modelo CLIP como un codificador de texto e imagen, y un prior de imagen de difusión (mapeo) entre los espacios latentes de las modalidades CLIP. Este enfoque aumenta el rendimiento visual del modelo y revela nuevos horizontes en la fusión de imágenes y manipulación de imágenes guiada por texto. El modelo Kandinsky es creado por Arseniy Shakhmatov, Anton Razzhigaev, Aleksandr Nikolich, Igor Pavlov, Andrey Kuznetsov y Denis Dimitrov.
Como usar
Texto a imagen
from diffusers import AutoPipelineForText2Image
import torch
pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
pipe = pipe.to("cuda")
prompt = "portrait of a young woman, blue eyes, cinematic"
negative_prompt = "low quality, bad quality"
image = pipe(prompt=prompt, negative_prompt=negative_prompt, prior_guidance_scale=1.0, height=768, width=768).images[0]
image.save("portrait.png")
Generación de imagen a imagen guiada por texto
from PIL import Image
import requests
from io import BytesIO
url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
response = requests.get(url)
original_image = Image.open(BytesIO(response.content)).convert("RGB")
original_image = original_image.resize((768, 512))
from diffusers import AutoPipelineForImage2Image
import torch
pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
prompt = "A fantasy landscape, Cinematic lighting"
negative_prompt = "low quality, bad quality"
image = pipe(prompt=prompt, image=original_image, strength=0.3, height=768, width=768).images[0]
image.save("fantasy_land.png")
Interpolación
from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline
from diffusers.utils import load_image
import PIL
import torch
pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")
img1 = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
)
img2 = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/starry_night.jpeg"
)
images_texts = ["a cat", img1, img2]
weights = [0.3, 0.3, 0.4]
prompt = ""
prior_out = pipe_prior.interpolate(images_texts, weights)
pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
pipe.to("cuda")
image = pipe(**prior_out, height=768, width=768).images[0]
image.save("starry_cat.png")
Generación de inpainting guiada por texto
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image
import torch
import numpy as np
pipe = AutoPipelineForInpainting.from_pretrained("kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
prompt = "a hat"
init_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
)
mask = np.zeros((768, 768), dtype=np.float32)
mask[:250, 250:-250] = 1
out = pipe(
prompt=prompt,
image=init_image,
mask_image=mask,
height=768,
width=768,
num_inference_steps=150,
)
image = out.images[0]
image.save("cat_with_hat.png")
Generación de texto a imagen con condicionamiento de ControlNet
import torch
import numpy as np
from transformers import pipeline
from diffusers.utils import load_image
from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline
def make_hint(image, depth_estimator):
image = depth_estimator(image)["depth"]
image = np.array(image)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
detected_map = torch.from_numpy(image).float() / 255.0
hint = detected_map.permute(2, 0, 1)
return hint
img = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png").resize((768, 768))
# Utilizamos el pipeline de 'depth-estimation' de transformers para procesar la imagen y obtener su mapa de profundidad
depth_estimator = pipeline("depth-estimation")
hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda")
pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")
pipe = KandinskyV22ControlnetPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
)
pipe.to("cuda")
prompt = "A robot, 4k photo"
negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"
generator = torch.Generator(device="cuda").manual_seed(43)
image_emb, zero_image_emb = pipe_prior(
prompt=prompt, negative_prompt=negative_prior_prompt, generator=generator
).to_tuple()
images = pipe(
image_embeds=image_emb,
negative_image_embeds=zero_image_emb,
hint=hint,
num_inference_steps=50,
generator=generator,
height=768,
width=768,
).images
images[0].save("robot_cat.png")
Generación de imagen a imagen con condicionamiento de ControlNet
import torch
import numpy as np
from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline
from diffusers.utils import load_image
from transformers import pipeline
img = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png").resize((768, 768))
def make_hint(image, depth_estimator):
image = depth_estimator(image)["depth"]
image = np.array(image)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
detected_map = torch.from_numpy(image).float() / 255.0
hint = detected_map.permute(2, 0, 1)
return hint
depth_estimator = pipeline("depth-estimation")
hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda")
pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")
pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
)
pipe.to("cuda")
prompt = "A robot, 4k photo"
negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"
generator = torch.Generator(device="cuda").manual_seed(43)
img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator)
negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator)
images = pipe(
image=img,
strength=0.5,
image_embeds=img_emb.image_embeds,
negative_image_embeds=negative_emb.image_embeds,
hint=hint,
num_inference_steps=50,
generator=generator,
height=768,
width=768,
).images
images[0].save("robot_cat.png")
Funcionalidades
- Modelo de difusión condicional basado en texto
- Utiliza CLIP como codificador de texto e imagen
- Introduce un prior de imagen de difusión entre los espacios latentes de CLIP
- Compatibilidad con diffusers
- Arquitectura que incluye un modelo prior basado en transformadores, un modelo de difusión unet y un decodificador
- Entrenado en datos de alta resolución y de diversos aspectos
Casos de uso
- Generación de imágenes a partir de texto
- Interpolación de imágenes
- Inpainting guiado por texto
- Generación de imágenes con condicionamiento de ControlNet