diff --git a/docs/source/en/api/pipelines/cosmos3.md b/docs/source/en/api/pipelines/cosmos3.md
index 7ce1ff4f58cf..1ac8f36457a4 100644
--- a/docs/source/en/api/pipelines/cosmos3.md
+++ b/docs/source/en/api/pipelines/cosmos3.md
@@ -77,6 +77,8 @@ python -m cosmos_framework.inference.prompt_upsampling \
Switch `--mode` to match the workflow you are targeting (`text2image`, `text2video`, `image2video`). The command writes the upsampled prompt(s) to the `--output` file as a JSON array (one object per non-empty line in `--input`); pass a `.jsonl` path instead to get one JSON object per line. For `image2video`, you must also supply the conditioning image via `--image-url` (a URL or local path) or `--image-list` (one image per prompt).
+
+
A pre-upsampled positive prompt (`assets/example_t2v_prompt.json`) and negative prompt (`assets/negative_prompt.json`) are provided for convenience, and are used by the generation examples below. The examples load these JSON files and pass them to the pipeline as JSON strings via `json.dumps(...)`.
## Text-to-video
@@ -276,6 +278,200 @@ export_to_video(result.video, "cosmos3_i2v.mp4", fps=24, macro_block_size=1)
+## Video-to-video
+
+Pass a conditioning clip via `video=` (e.g. from `load_video`). The pipeline anchors the leading latent frames given by `condition_frame_indexes_vision` (default `[0, 1]`) to the clip and denoises the rest. These are latent-frame indexes, not pixel-frame indexes: under 4x temporal compression, the default `[0, 1]` conditions on 5 pixel frames of the clip. Use `condition_video_keep` (`"first"` or `"last"`) to choose which end of a longer source clip the conditioning frames are taken from. As with the other modes, the prompt should follow the descriptive JSON structure described in [Prompt upsampling](#prompt-upsampling).
+
+
+
+
+
+
+```python
+import json
+import torch
+from diffusers import Cosmos3OmniPipeline
+from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
+from diffusers.utils import export_to_video, load_video
+
+# JSON-upsampled positive and negative prompts (see "Prompt upsampling" above).
+json_prompt = json.load(open("assets/example_v2v_prompt.json"))
+negative_prompt = json.load(open("assets/negative_prompt_i2v.json"))
+
+pipe = Cosmos3OmniPipeline.from_pretrained(
+ "nvidia/Cosmos3-Nano", torch_dtype=torch.bfloat16, device_map="cuda"
+)
+pipe.scheduler = UniPCMultistepScheduler.from_config(
+ pipe.scheduler.config, flow_shift=10.0, use_karras_sigmas=False
+)
+
+video = load_video(
+ "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_pouring.mp4"
+)
+
+result = pipe(
+ prompt=json.dumps(json_prompt),
+ negative_prompt=json.dumps(negative_prompt),
+ video=video,
+ condition_frame_indexes_vision=[0, 1],
+ condition_video_keep="first",
+ num_frames=189,
+ height=720,
+ width=1280,
+ num_inference_steps=35,
+ guidance_scale=6.0,
+ fps=24.0,
+)
+# macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).
+export_to_video(result.video, "cosmos3_v2v.mp4", fps=24, macro_block_size=1)
+```
+
+
+
+
+```python
+import json
+import torch
+from diffusers import Cosmos3OmniPipeline
+from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
+from diffusers.utils import export_to_video, load_video
+
+# JSON-upsampled positive and negative prompts (see "Prompt upsampling" above).
+json_prompt = json.load(open("assets/example_v2v_prompt.json"))
+negative_prompt = json.load(open("assets/negative_prompt_i2v.json"))
+
+pipe = Cosmos3OmniPipeline.from_pretrained(
+ "nvidia/Cosmos3-Super", torch_dtype=torch.bfloat16, device_map="cuda"
+)
+pipe.scheduler = UniPCMultistepScheduler.from_config(
+ pipe.scheduler.config, flow_shift=10.0, use_karras_sigmas=False
+)
+
+video = load_video(
+ "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_pouring.mp4"
+)
+
+result = pipe(
+ prompt=json.dumps(json_prompt),
+ negative_prompt=json.dumps(negative_prompt),
+ video=video,
+ condition_frame_indexes_vision=[0, 1],
+ condition_video_keep="first",
+ num_frames=189,
+ height=720,
+ width=1280,
+ num_inference_steps=35,
+ guidance_scale=6.0,
+ fps=24.0,
+)
+# macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).
+export_to_video(result.video, "cosmos3_v2v.mp4", fps=24, macro_block_size=1)
+```
+
+
+
+
+## Video-to-video with sound
+
+When the checkpoint carries a `sound_tokenizer`, add `enable_sound=True` to the video-to-video call to jointly generate a synchronized audio track. The waveform is returned alongside the video and can be muxed into the MP4 with [`~utils.encode_video`].
+
+
+
+
+```python
+import json
+import torch
+from diffusers import Cosmos3OmniPipeline
+from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
+from diffusers.utils import encode_video, load_video
+
+# JSON-upsampled positive and negative prompts (see "Prompt upsampling" above).
+json_prompt = json.load(open("assets/example_v2v_prompt.json"))
+negative_prompt = json.load(open("assets/negative_prompt_i2v.json"))
+
+pipe = Cosmos3OmniPipeline.from_pretrained(
+ "nvidia/Cosmos3-Nano", torch_dtype=torch.bfloat16, device_map="cuda"
+)
+pipe.scheduler = UniPCMultistepScheduler.from_config(
+ pipe.scheduler.config, flow_shift=10.0, use_karras_sigmas=False
+)
+
+video = load_video(
+ "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_pouring.mp4"
+)
+
+result = pipe(
+ prompt=json.dumps(json_prompt),
+ negative_prompt=json.dumps(negative_prompt),
+ video=video,
+ condition_frame_indexes_vision=[0, 1],
+ condition_video_keep="first",
+ num_frames=189,
+ height=720,
+ width=1280,
+ fps=24.0,
+ enable_sound=True,
+)
+
+encode_video(
+ result.video,
+ fps=24,
+ audio=result.sound,
+ audio_sample_rate=pipe.sound_tokenizer.config.sampling_rate,
+ output_path="cosmos3_v2v_with_sound.mp4",
+)
+```
+
+
+
+
+```python
+import json
+import torch
+from diffusers import Cosmos3OmniPipeline
+from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
+from diffusers.utils import encode_video, load_video
+
+# JSON-upsampled positive and negative prompts (see "Prompt upsampling" above).
+json_prompt = json.load(open("assets/example_v2v_prompt.json"))
+negative_prompt = json.load(open("assets/negative_prompt_i2v.json"))
+
+pipe = Cosmos3OmniPipeline.from_pretrained(
+ "nvidia/Cosmos3-Super", torch_dtype=torch.bfloat16, device_map="cuda"
+)
+pipe.scheduler = UniPCMultistepScheduler.from_config(
+ pipe.scheduler.config, flow_shift=10.0, use_karras_sigmas=False
+)
+
+video = load_video(
+ "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_pouring.mp4"
+)
+
+result = pipe(
+ prompt=json.dumps(json_prompt),
+ negative_prompt=json.dumps(negative_prompt),
+ video=video,
+ condition_frame_indexes_vision=[0, 1],
+ condition_video_keep="first",
+ num_frames=189,
+ height=720,
+ width=1280,
+ fps=24.0,
+ enable_sound=True,
+)
+
+encode_video(
+ result.video,
+ fps=24,
+ audio=result.sound,
+ audio_sample_rate=pipe.sound_tokenizer.config.sampling_rate,
+ output_path="cosmos3_v2v_with_sound.mp4",
+)
+```
+
+
+
+
## Text-to-video with sound
When the checkpoint carries a `sound_tokenizer`, pass `enable_sound=True` to jointly generate a synchronized audio track. The waveform is returned alongside the video and can be muxed into the MP4 with [`~utils.encode_video`].
diff --git a/examples/cosmos3/README.md b/examples/cosmos3/README.md
index dd4be5dc286f..bca9edc7aa6d 100644
--- a/examples/cosmos3/README.md
+++ b/examples/cosmos3/README.md
@@ -40,6 +40,16 @@ python examples/cosmos3/inference_cosmos3.py \
--vision-path https://github.com/nvidia-cosmos/cosmos-dependencies/releases/download/assets/robot_153.jpg
```
+Video-to-video (condition on the leading frames of a clip and continue it):
+
+```bash
+python examples/cosmos3/inference_cosmos3.py \
+ --prompt "A robotic arm finishes pouring liquid into the glass." \
+ --video-path "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_pouring.mp4" \
+ --condition-frame-indexes-vision 0,1 \
+ --condition-video-keep first
+```
+
Text-to-video-with-sound (sound-capable checkpoint only):
```bash
diff --git a/examples/cosmos3/inference_cosmos3.py b/examples/cosmos3/inference_cosmos3.py
index e9a5f5f369bb..62388c8d1288 100644
--- a/examples/cosmos3/inference_cosmos3.py
+++ b/examples/cosmos3/inference_cosmos3.py
@@ -18,6 +18,9 @@
Image-to-video:
python inference_cosmos3.py --prompt "..." --vision-path /path/to/image.jpg
+Video-to-video:
+ python inference_cosmos3.py --prompt "..." --video-path /path/to/video.mp4
+
Text-to-video-with-sound (requires a sound-capable checkpoint):
python inference_cosmos3.py --prompt "..." --enable-sound
"""
@@ -70,6 +73,22 @@ def main():
default=None,
help="Optional URL or local path for an image-conditioning frame, or an action conditioning video.",
)
+ parser.add_argument(
+ "--video-path",
+ default=None,
+ help="Optional URL or local path to a conditioning video for video-to-video generation.",
+ )
+ parser.add_argument(
+ "--condition-frame-indexes-vision",
+ default=None,
+ help="Comma-separated latent frame indexes kept clean for video-to-video (default: 0,1).",
+ )
+ parser.add_argument(
+ "--condition-video-keep",
+ choices=["first", "last"],
+ default="first",
+ help="Take the video-to-video conditioning frames from the first or last of the source clip (default: first).",
+ )
parser.add_argument("--output", default=".", help="Directory to save generated video/image/audio files.")
parser.add_argument(
"--height",
@@ -206,6 +225,30 @@ def main():
add_duration_template=args.add_duration_template,
enable_safety_check=not args.no_safety_check,
)
+ elif args.video_path is not None:
+ video = load_video(args.video_path)
+ condition_frame_indexes_vision = (
+ [int(i) for i in args.condition_frame_indexes_vision.split(",") if i.strip()]
+ if args.condition_frame_indexes_vision is not None
+ else [0, 1]
+ )
+ result = pipeline(
+ prompt=args.prompt,
+ video=video,
+ condition_frame_indexes_vision=condition_frame_indexes_vision,
+ condition_video_keep=args.condition_video_keep,
+ num_frames=args.num_frames,
+ height=args.height,
+ width=args.width,
+ fps=args.fps,
+ num_inference_steps=args.num_inference_steps,
+ enable_sound=args.enable_sound,
+ guidance_scale=args.guidance_scale,
+ generator=generator,
+ add_resolution_template=args.add_resolution_template,
+ add_duration_template=args.add_duration_template,
+ enable_safety_check=not args.no_safety_check,
+ )
else:
image = load_image(args.vision_path) if args.vision_path is not None else None
result = pipeline(
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py
index 5425b7b575eb..538b553d478d 100644
--- a/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py
+++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py
@@ -15,6 +15,7 @@
import copy
import json
import math
+from collections.abc import Iterable
from dataclasses import dataclass
from typing import Any, Callable, Literal
@@ -704,6 +705,9 @@ def _remove_action_video_padding_from_latent(
def prepare_latents(
self,
image: torch.Tensor | None = None,
+ video: list[Image.Image] | torch.Tensor | np.ndarray | None = None,
+ condition_frame_indexes_vision: Iterable[int] = (0, 1),
+ condition_video_keep: Literal["first", "last"] = "first",
num_frames: int | None = None,
height: int | None = None,
width: int | None = None,
@@ -737,6 +741,8 @@ def prepare_latents(
action_mode = action.mode if action is not None else None
is_image = num_frames == 1
has_image_condition = (image is not None and not is_image) or action_mode is not None
+ # Video-to-video conditioning: a top-level `video` without an action run.
+ has_video_condition = video is not None and action is None
# video_processor.preprocess handles PIL/np/tensor → [1, 3, H, W] in [-1, 1], resized to (height, width).
conditioning_frame_2d: torch.Tensor | None = None
@@ -745,6 +751,19 @@ def prepare_latents(
device=device, dtype=dtype
)
+ conditioning_frames_3d: torch.Tensor | None = None
+ condition_indexes_vision: tuple[int, ...] = tuple(condition_frame_indexes_vision)
+ if has_video_condition:
+ conditioning_frames_3d = self.video_processor.preprocess_video(video, height=height, width=width).to(
+ device=device, dtype=dtype
+ )
+ temporal_compression = int(self.vae.config.scale_factor_temporal)
+ max_cond_frames = max(condition_indexes_vision) * temporal_compression + 1
+ if condition_video_keep == "first":
+ conditioning_frames_3d = conditioning_frames_3d[:, :, :max_cond_frames]
+ else:
+ conditioning_frames_3d = conditioning_frames_3d[:, :, -max_cond_frames:]
+
action_domain_id: torch.Tensor | None = None
action_condition_mask: torch.Tensor | None = None
raw_action_dim_resolved: int | None = (
@@ -789,7 +808,17 @@ def prepare_latents(
)
else:
vision_tensor = torch.zeros(1, 3, num_frames, height, width, dtype=dtype, device=device)
- if conditioning_frame_2d is not None:
+ if conditioning_frames_3d is not None:
+ # Video-to-video: place the leading conditioning frames at the start, repeat-pad the tail with the
+ # last conditioning frame, then mark the conditioned latent indexes clean (encoded as a whole below).
+ t_fill = min(conditioning_frames_3d.shape[2], num_frames)
+ vision_tensor[:, :, :t_fill] = conditioning_frames_3d[:, :, :t_fill]
+ if t_fill < num_frames:
+ vision_tensor[:, :, t_fill:] = vision_tensor[:, :, t_fill - 1 : t_fill].expand(
+ -1, -1, num_frames - t_fill, -1, -1
+ )
+ vision_condition_frames = list(condition_indexes_vision)
+ elif conditioning_frame_2d is not None:
# Single conditioning frame at t=0, repeat-pad the rest with that same frame.
vision_tensor[:, :, 0] = conditioning_frame_2d
if num_frames > 1:
@@ -928,6 +957,8 @@ def check_inputs(
enable_sound: bool,
callback_on_step_end_tensor_inputs: list[str],
action: "CosmosActionCondition | None" = None,
+ video: list[Image.Image] | torch.Tensor | np.ndarray | None = None,
+ condition_frame_indexes_vision: Iterable[int] = (0, 1),
) -> None:
if not isinstance(prompt, (str, list)) or (
isinstance(prompt, list) and not all(isinstance(p, str) for p in prompt)
@@ -958,6 +989,8 @@ def check_inputs(
raise ValueError(
"Pass action conditioning via `action.image` / `action.video`, not the top-level `image` argument."
)
+ if video is not None:
+ raise ValueError("Pass action conditioning via `action.video`, not the top-level `video` argument.")
if not getattr(self.transformer.config, "action_gen", False):
raise ValueError("`action` requires a transformer trained with action_gen=True.")
if action.mode == "forward_dynamics" and action.raw_actions is not None:
@@ -976,6 +1009,27 @@ def check_inputs(
sf = int(self.vae.config.scale_factor_spatial)
if height % sf != 0 or width % sf != 0:
raise ValueError(f"`height` and `width` must be multiples of {sf}, got ({height}, {width}).")
+ if image is not None and video is not None:
+ raise ValueError("Pass either `image` (image-to-video) or `video` (video-to-video), not both.")
+ if video is not None:
+ if num_frames == 1:
+ raise ValueError("`video` conditioning requires `num_frames` > 1.")
+ if isinstance(condition_frame_indexes_vision, (str, bytes)) or not all(
+ isinstance(index, int) and index >= 0 for index in condition_frame_indexes_vision
+ ):
+ raise ValueError(
+ f"`condition_frame_indexes_vision` must be a list of non-negative ints, e.g. [0, 1]; got "
+ f"{condition_frame_indexes_vision!r}."
+ )
+ indexes = tuple(condition_frame_indexes_vision)
+ if not indexes:
+ raise ValueError("`condition_frame_indexes_vision` must contain at least one index.")
+ latent_t = (num_frames - 1) // int(self.vae.config.scale_factor_temporal) + 1
+ if max(indexes) >= latent_t:
+ raise ValueError(
+ f"`condition_frame_indexes_vision` {indexes} contains an index outside the latent timeline "
+ f"(latent_frames={latent_t} for num_frames={num_frames})."
+ )
@staticmethod
def _build_action_json_prompt(
@@ -1198,6 +1252,9 @@ def __call__(
prompt: str | list[str],
negative_prompt: str | list[str] | None = None,
image: torch.Tensor | None = None,
+ video: list[Image.Image] | torch.Tensor | np.ndarray | None = None,
+ condition_frame_indexes_vision: Iterable[int] = (0, 1),
+ condition_video_keep: Literal["first", "last"] = "first",
num_frames: int | None = None,
height: int | None = None,
width: int | None = None,
@@ -1223,9 +1280,13 @@ def __call__(
enable_safety_check: bool = True,
) -> Cosmos3OmniPipelineOutput:
r"""
- Run the Cosmos 3 omni pipeline end-to-end: encode the (optional) conditioning image, denoise vision and
+ Run the Cosmos 3 omni pipeline end-to-end: encode the (optional) conditioning image/video, denoise vision and
(optional) sound latents jointly, and decode them back into a video and audio waveform.
+ The generation mode is selected from the inputs: text-to-image when `num_frames == 1`, image-to-video when
+ `image` is supplied, video-to-video when `video` is supplied (without `action`), action-conditioned
+ generation when `action` is supplied, and text-to-video otherwise.
+
Args:
prompt (`str` or `List[str]`):
The prompt to guide generation. Lists are collapsed to the first entry — the pipeline runs one sample
@@ -1235,6 +1296,20 @@ def __call__(
image (`torch.Tensor` or `PIL.Image.Image`, *optional*):
Optional conditioning frame for image-to-video. The pipeline anchors frame 0 to this image and denoises
the remaining frames. Ignored when `num_frames == 1`. Not used for action runs (pass `action` instead).
+ Mutually exclusive with `video`.
+ video (`List[PIL.Image.Image]`, `torch.Tensor`, or `np.ndarray`, *optional*):
+ Optional conditioning clip for video-to-video. The leading frames are kept clean at the latent indexes
+ given by `condition_frame_indexes_vision` and the remaining frames are denoised. Each frame is
+ preprocessed (resized to `height`/`width`) like the `image` input. The canonical input is a list of PIL
+ frames, e.g. from `diffusers.utils.load_video`. Mutually exclusive with `image`; not used for action
+ runs (pass `action.video` instead).
+ condition_frame_indexes_vision (`Iterable[int]`, *optional*, defaults to `(0, 1)`):
+ Latent frame indexes to keep clean when `video` conditioning is supplied. The default `(0, 1)` keeps
+ the first two latent frames, which span 5 pixel frames under 4x temporal compression. Only consulted
+ for video-to-video.
+ condition_video_keep (`str`, *optional*, defaults to `"first"`):
+ Which end of a longer source `video` to take the conditioning frames from: `"first"` or `"last"`. Only
+ consulted for video-to-video.
num_frames (`int`, *optional*, defaults to `None`):
Number of frames to generate. Use `1` for text-to-image. Defaults to `189` (≈ 7.9 s at 24 FPS) for
non-action modes when omitted (`None`). Must be `None` for action runs, where frame count is derived
@@ -1327,6 +1402,8 @@ def __call__(
enable_sound,
callback_on_step_end_tensor_inputs,
action,
+ video=video,
+ condition_frame_indexes_vision=condition_frame_indexes_vision,
)
# `action_mode` is the only action field consumed directly in __call__ (prompt template + output slicing);
@@ -1405,6 +1482,9 @@ def __call__(
action_condition_frame_indexes,
) = self.prepare_latents(
image=image,
+ video=video,
+ condition_frame_indexes_vision=condition_frame_indexes_vision,
+ condition_video_keep=condition_video_keep,
num_frames=num_frames,
height=height,
width=width,