diff --git a/.gitignore b/.gitignore index 8f322f0..d5866ec 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,6 @@ yarn-error.log* # typescript *.tsbuildinfo next-env.d.ts + + +.vscode \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index f91f003..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "typescript.tsdk": "node_modules/typescript/lib", - "workbench.colorCustomizations": { - "activityBar.background": "#023524", - "titleBar.activeBackground": "#034A33", - "titleBar.activeForeground": "#EDFEF8" - } -} \ No newline at end of file diff --git a/app/projects/dedit/assets/arch.png b/app/projects/dedit/assets/arch.png new file mode 100644 index 0000000..081a1b4 Binary files /dev/null and b/app/projects/dedit/assets/arch.png differ diff --git a/app/projects/dedit/assets/img3_text2.jpg b/app/projects/dedit/assets/img3_text2.jpg new file mode 100644 index 0000000..5f6b4ae Binary files /dev/null and b/app/projects/dedit/assets/img3_text2.jpg differ diff --git a/app/projects/dedit/assets/img4_image2.jpg b/app/projects/dedit/assets/img4_image2.jpg new file mode 100644 index 0000000..ca1f770 Binary files /dev/null and b/app/projects/dedit/assets/img4_image2.jpg differ diff --git a/app/projects/dedit/assets/img5_mask.jpg b/app/projects/dedit/assets/img5_mask.jpg new file mode 100644 index 0000000..95dedf6 Binary files /dev/null and b/app/projects/dedit/assets/img5_mask.jpg differ diff --git a/app/projects/dedit/assets/img7_remove.jpg b/app/projects/dedit/assets/img7_remove.jpg new file mode 100644 index 0000000..25c79f1 Binary files /dev/null and b/app/projects/dedit/assets/img7_remove.jpg differ diff --git a/app/projects/dedit/page.mdx b/app/projects/dedit/page.mdx new file mode 100644 index 0000000..b033218 --- /dev/null +++ b/app/projects/dedit/page.mdx @@ -0,0 +1,95 @@ +import { Authors, Badges } from '@/components/utils' + +# An Item is Worth a Prompt: Versatile Image Editing 
with Disentangled Control + + + + + +## Introduction +Recent advancements in text-to-image diffusion models have revolutionized image editing by enabling sophisticated control over tasks like inpainting, text-guided editing, and item removal. Despite progress, challenges remain in preserving original image integrity and achieving precise semantic alignment with modifications. To address these, we introduce D-Edit, a versatile framework that disentangles item-prompt interactions using grouped cross-attention and unique item prompts. D-Edit supports text-based, image-based, mask-based editing, and item removal within a unified system, offering unprecedented flexibility and precision for creative and practical editing applications. + + + +## Method + +#### Item-Prompt Association + +The original LDM performs text-image interaction between every token in $c$ and every pixel in $z_t$ through cross-attention matrix $A$. +In fact, such token-pixel interactions have been shown to be disentangled in nature, and the attention matrix $A\in\mathbb{R}^{Z\times W}$ is usually sparse in the sense that each column (token) only attends to several non-zero rows (pixels). +For example, during image generation, the word "bear" has higher attention scores with pixels related to the bear region compared to the remaining region. + + +Inspired by the natural disentanglement, we propose to segment the given image $I$ into $N$ non-overlapped items $\{I_i \}_{i=1}^{N}$ using a segmentation model (same segmentation applied to $z^t$ because of emergent correspondence). +A set of prompts $\{ P_i\}_{i=1}^{N}$ is adopted to replace the original text prompt $P$. +We force different items $I_i$ to be controlled by a distinct prompt $P_i$ by masking out other items, and therefore any prompt changes in $P_i$ will not influence the remaining items during the cross-attention controlling flow, which is the desired property for image editing. +This results in a group of disentangled cross-attentions. 
For each item-prompt pair ($I_i$, $P_i$), the cross-attention can be written as +$$ +q_i=w_q z^t_i \in \mathbb{R}^{Z_i\times D} \quad +k_i = w_k c_i \in \mathbb{R}^{W_i\times D} \quad +v_i = w_v c_i \in \mathbb{R}^{W_i\times D}\\ +\quad + +\text{out}(\{ c_i\}, \{z^t_i\}) = \sum_{i=1}^{N} \text{out}_i(c_i, z^t_i) \quad + A_i = \text{softmax}(q_ik_i^T) \in \mathbb{R}^{Z_i\times W_i} \quad + \text{out}_i(c_i, z^t_i) = A_i\cdot v_i +$$ +It should be noted that such disentangled cross-attention cannot be directly used for pretrained LDMs, and therefore further finetuning is necessary to enable the model to comprehend item prompts and grouped cross-attention. + +![Comparison of conventional full cross-attention and grouped cross-attention. Query, key, and value are shown as one-dimensional vectors. For grouped cross-attention, each item (corresponding to certain pixels/patches) only attends to the text prompt (two tokens) assigned to it.|scale=0.7](./assets/arch.png) + + +#### Linking Prompt to Item + +We link prompts to items with two sequential steps. We first introduce the item prompt, consisting of several special tokens with randomly initialized embeddings. +Then we finetune the model to build the item-prompt association. + + + +##### Prompt Injection +We propose to represent each item in an image with several new tokens which are inserted into the existing vocabulary of text encoder(s). +Specifically, we use 2 tokens to represent each item and initialize the newly added embedding entries using a Gaussian distribution with mean and standard deviation derived from the existing vocabulary. +For comparison, DreamBooth represents the image using rare tokens, and +perfect rare tokens, which are hard to find, should have no interference with the existing vocabulary. +Textual Inversion and Imagic insert new tokens into the vocabulary where the corresponding embedding is semantically initialized by given word embeddings which describe the image. 
This adds the additional burden of captioning the original image. +We found that it is sufficient to use randomly initialized new tokens as item prompts and such randomly initialized tokens have minimal impact on the existing vocabularies. + + +To associate items with prompts, the inserted embedding entries are then optimized to reconstruct the corresponding image to be edited using +$$ +\text{min}_e \mathbb{E}_{t,\epsilon}\left[|| \epsilon - f_\theta (z_t, t, g_\Phi(P) )||^2 \right], +$$ +where $e\in\mathbb{R}^{NM\times D_{\text{emb}}}$ +represents the embedding rows corresponding to $N$ items each with $M$ tokens. + + +##### Model Finetuning +Optimization in the first stage injects the image concept into text-encoder(s), but cannot achieve perfect reconstruction of the original item given the corresponding prompt. +Therefore, in the second stage of optimization, we optimize the UNet parameters with the same objective function as in the equation above. +We found that updating parameters solely within cross-attention layers is adequate, as we only disentangle the forward process of these layers rather than the entire model. +It should be noted that the optimizations above are run on only one image, or two images (target and reference images) if image-based editing is needed. + +##### Editing with Item-Prompt Freestyle +After the two-step optimization, the model can exactly reconstruct the original image given the set of prompts corresponding to each item, with an appropriate classifier-free guidance scale. +We then achieve various disentangled image editing by changing the prompt associated with an item, the mask of an item-prompt pair, and the mapping between items and prompts. + + + +## Experiments +#### Text-based Editing +![The learned prompt (denoted as [v]) can be combined with words to achieve refinement/editing of the target item. (a) Augment an item prompt with words while keeping other prompts unchanged for editing. 
(b) Generate the entire image with certain item prompt(s) augmented with text words for personalization.|scale=0.7](./assets/img3_text2.jpg) + +#### Image-based Editing +![Qualitative comparison of image-guided editing. D-Edit is compared with Anydoor, Paint-by-Example, and TF-ICON, on item replacement and face swapping.|scale=0.7](./assets/img4_image2.jpg) +#### Mask-based Editing +![Different types of mask-based editing: (a) Moving/swapping items; (b) Reshaping an item; (c) Resizing an item.|scale=0.7](./assets/img5_mask.jpg) +#### Item Removal +![Removing items one by one from the image.|scale=0.7](./assets/img7_remove.jpg) \ No newline at end of file diff --git a/config/publications.ts b/config/publications.ts index a6429ec..4af1dfd 100644 --- a/config/publications.ts +++ b/config/publications.ts @@ -29,6 +29,20 @@ export const publications: Publication[] = [ impact: "Beyond theoretical guarantees, we demonstrate the improvements achieved by LResNet in building hyperbolic deep learning models, where we conduct extensive experiments to show its superior performance in graph and image modalities across CNNs, GNNs, and graph Transformers.", tags: [Tag.MultiModalFoundationModel], }, + { + title: "D-Edit: An Item is Worth a Prompt: Versatile Image Editing with Disentangled Control", + authors: + "Aosong Feng, Weikang Qiu, Jinbin Bai, Xiao Zhang, Zhen Dong, Kaicheng Zhou, Rex Ying, and Leandros Tassiulas", + venue: "AAAI 2025", + page: "dedit", + paper: "https://arxiv.org/abs/2403.04880", + code: "https://github.com/collovlabs/d-edit", + tags: [Tag.Applications], + abstract: + "D-Edit is a novel diffusion-based image editing framework that disentangles image-prompt interactions into item-prompt associations, enabling precise and harmonious edits across the image, achieving state-of-the-art results in a unified, versatile approach.", + impact: + "The proposed method is a unified editing framework that supports image-based, text-based, mask-based editing, and item removal 
within a single cohesive system.", + }, { title: "Protein-Nucleic Acid Complex Modeling with Frame Averaging Transformer", authors: "Tinglin Huang, Zhenqiao Song, Rex Ying, Wengong Jin",