From 8b0c7fca5e1ee666a81d4411f27bba76d02cc213 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 11 Mar 2026 10:23:44 +0800 Subject: [PATCH 1/5] temp commit --- diffsynth/utils/data/media_io_ltx2.py | 10 ++++ .../model_inference/LTX-2.3-A2V-TwoStage.py | 59 +++++++++++++++++++ pyproject.toml | 1 + 3 files changed, 70 insertions(+) create mode 100644 examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py diff --git a/diffsynth/utils/data/media_io_ltx2.py b/diffsynth/utils/data/media_io_ltx2.py index 689d8afc..d585c91e 100644 --- a/diffsynth/utils/data/media_io_ltx2.py +++ b/diffsynth/utils/data/media_io_ltx2.py @@ -7,6 +7,7 @@ import numpy as np from io import BytesIO from collections.abc import Generator, Iterator +import torchaudio def _resample_audio( @@ -137,6 +138,15 @@ def write_video_audio_ltx2( container.close() +def read_audio_with_torchaudio(path: str, start_time: float = 0, duration: float | None = None) -> torch.Tensor: + waveform, sample_rate = torchaudio.load(path, channels_first=True) + start_frame = int(start_time * sample_rate) + if start_frame > waveform.shape[-1]: + raise ValueError(f"start_time of {start_time} exceeds max duration of {waveform.shape[-1] / sample_rate:.2f}") + end_frame = -1 if duration is None else int(duration * sample_rate) + return waveform[..., start_frame:end_frame] + + def encode_single_frame(output_file: str, image_array: np.ndarray, crf: float) -> None: container = av.open(output_file, "w", format="mp4") try: diff --git a/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py b/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py new file mode 100644 index 00000000..e7cb86dc --- /dev/null +++ b/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py @@ -0,0 +1,59 @@ +import torch +from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig +from diffsynth.utils.data.media_io_ltx2 import read_audio_with_torchaudio, write_video_audio_ltx2 + +audio = 
read_audio_with_torchaudio("data/example_video_dataset/ltx2/sing.MP3") + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cuda", + "preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = LTX2AudioVideoPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), + stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-distilled-lora-384.safetensors"), +) +prompt = "A girl is very happy, she is speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”" +negative_prompt = ( + "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " + "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, " + "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, " + "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of " + "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent " + "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny " + "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, " + "mismatched lip sync, silent or muted audio, distorted voice, 
robotic voice, echo, background noise, " + "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward " + "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, " + "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." +) +height, width, num_frames = 512 * 2, 768 * 2, 121 +video, audio = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + seed=43, + height=height, + width=width, + num_frames=num_frames, + tiled=True, + use_two_stage_pipeline=True, +) +write_video_audio_ltx2( + video=video, + audio=audio, + output_path='ltx2.3_twostage.mp4', + fps=24, + audio_sample_rate=pipe.audio_vocoder.output_sampling_rate, +) diff --git a/pyproject.toml b/pyproject.toml index c6ba7672..5ddc5606 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ requires-python = ">=3.10.1" dependencies = [ "torch>=2.0.0", "torchvision", + "torchaudio", "transformers", "imageio", "imageio[ffmpeg]", From 26cd2834cbb0b79eb5d9839d130ce9d343f58ae6 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 11 Mar 2026 14:22:02 +0800 Subject: [PATCH 2/5] support ltx2 a2v --- diffsynth/pipelines/ltx2_audio_video.py | 74 +++++++++++++------ diffsynth/utils/data/media_io_ltx2.py | 29 +++++--- .../model_inference/LTX-2.3-A2V-TwoStage.py | 18 +++-- 3 files changed, 83 insertions(+), 38 deletions(-) diff --git a/diffsynth/pipelines/ltx2_audio_video.py b/diffsynth/pipelines/ltx2_audio_video.py index ba25f6df..82a669fd 100644 --- a/diffsynth/pipelines/ltx2_audio_video.py +++ b/diffsynth/pipelines/ltx2_audio_video.py @@ -67,8 +67,9 @@ def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16): LTX2AudioVideoUnit_SwitchStage2(), LTX2AudioVideoUnit_NoiseInitializer(), LTX2AudioVideoUnit_LatentsUpsampler(), - LTX2AudioVideoUnit_SetScheduleStage2(), LTX2AudioVideoUnit_InputImagesEmbedder(), + LTX2AudioVideoUnit_InputAudioEmbedder(), + 
LTX2AudioVideoUnit_SetScheduleStage2(), ] self.model_fn = model_fn_ltx2 @@ -155,8 +156,9 @@ def denoise_stage(self, inputs_shared, inputs_posi, inputs_nega, units, cfg_scal **models, timestep=timestep, progress_id=progress_id ) inputs_shared["video_latents"] = self.step(self.scheduler, inputs_shared["video_latents"], progress_id=progress_id, noise_pred=noise_pred_video, - inpaint_mask=inputs_shared.get("denoise_mask_video", None), input_latents=inputs_shared.get("input_latents_video", None), **inputs_shared) - inputs_shared["audio_latents"] = self.step(self.scheduler, inputs_shared["audio_latents"], progress_id=progress_id, noise_pred=noise_pred_audio, **inputs_shared) + inpaint_mask=inputs_shared.get("video_denoise_mask", None), input_latents=inputs_shared.get("video_input_latents", None), **inputs_shared) + inputs_shared["audio_latents"] = self.step(self.scheduler, inputs_shared["audio_latents"], progress_id=progress_id, noise_pred=noise_pred_audio, + inpaint_mask=inputs_shared.get("audio_denoise_mask", None), input_latents=inputs_shared.get("audio_input_latents", None), **inputs_shared) return inputs_shared, inputs_posi, inputs_nega @torch.no_grad() @@ -173,6 +175,9 @@ def __call__( # In-Context Video Control in_context_videos: Optional[list[list[Image.Image]]] = None, in_context_downsample_factor: Optional[int] = 2, + # Audio-to-video + input_audio: Optional[torch.Tensor] = None, + audio_sample_rate: Optional[int] = 48000, # Randomness seed: Optional[int] = None, rand_device: Optional[str] = "cpu", @@ -210,6 +215,7 @@ def __call__( } inputs_shared = { "input_images": input_images, "input_images_indexes": input_images_indexes, "input_images_strength": input_images_strength, + "input_audio": (input_audio, audio_sample_rate) if input_audio is not None else None, "in_context_videos": in_context_videos, "in_context_downsample_factor": in_context_downsample_factor, "seed": seed, "rand_device": rand_device, "height": height, "width": width, "num_frames": num_frames, 
"frame_rate": frame_rate, @@ -361,7 +367,8 @@ def process(self, pipe: LTX2AudioVideoPipeline, input_video, video_noise, tiled, input_video = pipe.preprocess_video(input_video) input_latents = pipe.video_vae_encoder.encode(input_video, tiled, tile_size_in_pixels, tile_overlap_in_pixels).to(dtype=pipe.torch_dtype, device=pipe.device) if pipe.scheduler.training: - return {"video_latents": input_latents, "input_latents": input_latents} + # input_latents key is for training to add noise. with no prefix "video" to keep loss function keyword consistent. + return {"video_latents": video_noise, "input_latents": input_latents} else: raise NotImplementedError("Video-to-video not implemented yet.") @@ -370,7 +377,7 @@ class LTX2AudioVideoUnit_InputAudioEmbedder(PipelineUnit): def __init__(self): super().__init__( input_params=("input_audio", "audio_noise"), - output_params=("audio_latents", "audio_input_latents", "audio_positions", "audio_latent_shape"), + output_params=("audio_latents", "audio_input_latents", "audio_noise", "audio_positions", "audio_latent_shape", "audio_denoise_mask"), onload_model_names=("audio_vae_encoder",) ) @@ -380,21 +387,37 @@ def process(self, pipe: LTX2AudioVideoPipeline, input_audio, audio_noise): else: input_audio, sample_rate = input_audio pipe.load_models_to_device(self.onload_model_names) - input_audio = pipe.audio_processor.waveform_to_mel(input_audio.unsqueeze(0), waveform_sample_rate=sample_rate).to(dtype=pipe.torch_dtype) + input_audio = pipe.audio_processor.waveform_to_mel(input_audio.unsqueeze(0), waveform_sample_rate=sample_rate).to(dtype=pipe.torch_dtype, device=pipe.device) audio_input_latents = pipe.audio_vae_encoder(input_audio) audio_latent_shape = AudioLatentShape.from_torch_shape(audio_input_latents.shape) audio_positions = pipe.audio_patchifier.get_patch_grid_bounds(audio_latent_shape, device=pipe.device) if pipe.scheduler.training: - return {"audio_latents": audio_input_latents, "audio_input_latents": audio_input_latents, 
"audio_positions": audio_positions, "audio_latent_shape": audio_latent_shape} + return { + "audio_latents": audio_input_latents, + "audio_input_latents": audio_input_latents, + "audio_positions": audio_positions, + "audio_latent_shape": audio_latent_shape, + } else: - raise NotImplementedError("Audio-to-video not supported.") + b, c, t, f = audio_input_latents.shape + audio_denoise_mask = torch.zeros((b, 1, t, 1), device=audio_input_latents.device, dtype=audio_input_latents.dtype) + audio_noise = torch.rand_like(audio_input_latents) + audio_latents = pipe.scheduler.add_noise(audio_input_latents, audio_noise, pipe.scheduler.timesteps[0]) + return { + "audio_latents": audio_latents, + "audio_input_latents": audio_input_latents, + "audio_noise": audio_noise, + "audio_positions": audio_positions, + "audio_latent_shape": audio_latent_shape, + "audio_denoise_mask": audio_denoise_mask, + } class LTX2AudioVideoUnit_InputImagesEmbedder(PipelineUnit): def __init__(self): super().__init__( input_params=("input_images", "input_images_indexes", "input_images_strength", "video_latents", "height", "width", "frame_rate", "tiled", "tile_size_in_pixels", "tile_overlap_in_pixels", "initial_latents"), - output_params=("denoise_mask_video", "input_latents_video", "ref_frames_latents", "ref_frames_positions"), + output_params=("video_denoise_mask", "video_input_latents", "ref_frames_latents", "ref_frames_positions"), onload_model_names=("video_vae_encoder") ) @@ -406,9 +429,9 @@ def get_image_latent(self, pipe, input_image, height, width, tiled, tile_size_in latents = pipe.video_vae_encoder.encode(image, tiled, tile_size_in_pixels, tile_overlap_in_pixels).to(pipe.device) return latents - def apply_input_images_to_latents(self, latents, input_latents, input_indexes, input_strength=1.0, initial_latents=None, denoise_mask_video=None): + def apply_input_images_to_latents(self, latents, input_latents, input_indexes, input_strength=1.0, initial_latents=None, video_denoise_mask=None): b, _, f, 
h, w = latents.shape - denoise_mask = torch.ones((b, 1, f, h, w), dtype=latents.dtype, device=latents.device) if denoise_mask_video is None else denoise_mask_video + denoise_mask = torch.ones((b, 1, f, h, w), dtype=latents.dtype, device=latents.device) if video_denoise_mask is None else video_denoise_mask initial_latents = torch.zeros_like(latents) if initial_latents is None else initial_latents for idx, input_latent in zip(input_indexes, input_latents): idx = min(max(1 + (idx-1) // 8, 0), f - 1) @@ -424,13 +447,13 @@ def process(self, pipe: LTX2AudioVideoPipeline, video_latents, input_images, hei if len(input_images_indexes) != len(set(input_images_indexes)): raise ValueError("Input images must have unique indexes.") pipe.load_models_to_device(self.onload_model_names) - frame_conditions = {"input_latents_video": None, "denoise_mask_video": None, "ref_frames_latents": [], "ref_frames_positions": []} + frame_conditions = {"video_input_latents": None, "video_denoise_mask": None, "ref_frames_latents": [], "ref_frames_positions": []} for img, index in zip(input_images, input_images_indexes): latents = self.get_image_latent(pipe, img, height, width, tiled, tile_size_in_pixels, tile_overlap_in_pixels) # first_frame by replacing latents if index == 0: - input_latents_video, denoise_mask_video = self.apply_input_images_to_latents(video_latents, [latents], [0], input_images_strength, initial_latents) - frame_conditions.update({"input_latents_video": input_latents_video, "denoise_mask_video": denoise_mask_video}) + video_input_latents, video_denoise_mask = self.apply_input_images_to_latents(video_latents, [latents], [0], input_images_strength, initial_latents) + frame_conditions.update({"video_input_latents": video_input_latents, "video_denoise_mask": video_denoise_mask}) # other frames by adding reference latents else: latent_coords = pipe.video_patchifier.get_patch_grid_bounds(output_shape=VideoLatentShape.from_torch_shape(latents.shape), device=pipe.device) @@ -560,14 
+583,17 @@ def model_fn_ltx2( audio_patchifier=None, timestep=None, # First Frame Conditioning - input_latents_video=None, - denoise_mask_video=None, + video_input_latents=None, + video_denoise_mask=None, # Other Frames Conditioning ref_frames_latents=None, ref_frames_positions=None, # In-Context Conditioning in_context_video_latents=None, in_context_video_positions=None, + # Audio Inputs + audio_input_latents=None, + audio_denoise_mask=None, # Gradient Checkpointing use_gradient_checkpointing=False, use_gradient_checkpointing_offload=False, @@ -581,12 +607,12 @@ def model_fn_ltx2( seq_len_video = video_latents.shape[1] video_timesteps = timestep.repeat(1, video_latents.shape[1], 1) # Frist frame conditioning by replacing the video latents - if input_latents_video is not None: - denoise_mask_video = video_patchifier.patchify(denoise_mask_video) - video_latents = video_latents * denoise_mask_video + video_patchifier.patchify(input_latents_video) * (1.0 - denoise_mask_video) - video_timesteps = denoise_mask_video * video_timesteps - - # Conditioning by replacing the video latents + if video_input_latents is not None: + video_denoise_mask = video_patchifier.patchify(video_denoise_mask) + video_latents = video_latents * video_denoise_mask + video_patchifier.patchify(video_input_latents) * (1.0 - video_denoise_mask) + video_timesteps = video_denoise_mask * video_timesteps + + # Reference conditioning by appending the reference video or frame latents total_ref_latents = ref_frames_latents if ref_frames_latents is not None else [] total_ref_positions = ref_frames_positions if ref_frames_positions is not None else [] total_ref_latents += [in_context_video_latents] if in_context_video_latents is not None else [] @@ -605,6 +631,10 @@ def model_fn_ltx2( audio_timesteps = timestep.repeat(1, audio_latents.shape[1], 1) else: audio_timesteps = None + if audio_input_latents is not None: + audio_denoise_mask = audio_patchifier.patchify(audio_denoise_mask) + audio_latents = 
audio_latents * audio_denoise_mask + audio_patchifier.patchify(audio_input_latents) * (1.0 - audio_denoise_mask) + audio_timesteps = audio_denoise_mask * audio_timesteps vx, ax = dit( video_latents=video_latents, diff --git a/diffsynth/utils/data/media_io_ltx2.py b/diffsynth/utils/data/media_io_ltx2.py index d585c91e..90aed07d 100644 --- a/diffsynth/utils/data/media_io_ltx2.py +++ b/diffsynth/utils/data/media_io_ltx2.py @@ -1,13 +1,11 @@ - from fractions import Fraction import torch +import torchaudio import av from tqdm import tqdm from PIL import Image import numpy as np from io import BytesIO -from collections.abc import Generator, Iterator -import torchaudio def _resample_audio( @@ -70,9 +68,9 @@ def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: audio_stream = container.add_stream("aac") supported_sample_rates = audio_stream.codec_context.codec.audio_rates if supported_sample_rates: - best_rate = min(supported_sample_rates, key=lambda x: abs(x - audio_sample_rate)) - if best_rate != audio_sample_rate: - print(f"Using closest supported audio sample rate: {best_rate}") + best_rate = min(supported_sample_rates, key=lambda x: abs(x - audio_sample_rate)) + if best_rate != audio_sample_rate: + print(f"Using closest supported audio sample rate: {best_rate}") else: best_rate = audio_sample_rate audio_stream.codec_context.sample_rate = best_rate @@ -117,7 +115,7 @@ def write_video_audio_ltx2( stream.width = width stream.height = height stream.pix_fmt = "yuv420p" - + if audio is not None: if audio_sample_rate is None: raise ValueError("audio_sample_rate is required when audio is provided") @@ -138,13 +136,24 @@ def write_video_audio_ltx2( container.close() -def read_audio_with_torchaudio(path: str, start_time: float = 0, duration: float | None = None) -> torch.Tensor: +def resample_waveform(waveform: torch.Tensor, source_rate: int, target_rate: int) -> torch.Tensor: + """Resample waveform to target sample rate if needed.""" + if source_rate 
== target_rate: + return waveform + resampled = torchaudio.functional.resample(waveform, source_rate, target_rate) + return resampled.to(dtype=waveform.dtype) + + +def read_audio_with_torchaudio(path: str, start_time: float = 0, duration: float | None = None, resample: bool = False, resample_rate: int = 48000) -> torch.Tensor: waveform, sample_rate = torchaudio.load(path, channels_first=True) + if resample: + waveform = resample_waveform(waveform, sample_rate, resample_rate) + sample_rate = resample_rate start_frame = int(start_time * sample_rate) if start_frame > waveform.shape[-1]: raise ValueError(f"start_time of {start_time} exceeds max duration of {waveform.shape[-1] / sample_rate:.2f}") - end_frame = -1 if duration is None else int(duration * sample_rate) - return waveform[..., start_frame:end_frame] + end_frame = -1 if duration is None else int(duration * sample_rate + start_frame) + return waveform[..., start_frame:end_frame], sample_rate def encode_single_frame(output_file: str, image_array: np.ndarray, crf: float) -> None: diff --git a/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py b/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py index e7cb86dc..d42dade1 100644 --- a/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py +++ b/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py @@ -1,8 +1,7 @@ import torch from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig from diffsynth.utils.data.media_io_ltx2 import read_audio_with_torchaudio, write_video_audio_ltx2 - -audio = read_audio_with_torchaudio("data/example_video_dataset/ltx2/sing.MP3") +from modelscope import dataset_snapshot_download vram_config = { "offload_dtype": torch.bfloat16, @@ -25,7 +24,9 @@ tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-distilled-lora-384.safetensors"), ) -prompt = "A girl is very happy, she is 
speaking: “I enjoy working with Diffsynth-Studio, it's a perfect framework.”" + +dataset_snapshot_download("DiffSynth-Studio/example_video_dataset", allow_file_pattern="ltx2/*", local_dir="data/example_video_dataset") +prompt = "A beautiful woman with a flower crown is singing happily under a blooming cherry tree." negative_prompt = ( "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, " @@ -39,21 +40,26 @@ "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, " "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." ) -height, width, num_frames = 512 * 2, 768 * 2, 121 +height, width, num_frames, frame_rate = 512 * 2, 768 * 2, 121, 24 +duration = num_frames / frame_rate +audio, audio_sample_rate = read_audio_with_torchaudio("data/example_video_dataset/ltx2/sing.MP3", start_time=1, duration=duration) video, audio = pipe( prompt=prompt, negative_prompt=negative_prompt, + input_audio=audio, + audio_sample_rate=audio_sample_rate, seed=43, height=height, width=width, num_frames=num_frames, + frame_rate=frame_rate, tiled=True, use_two_stage_pipeline=True, ) write_video_audio_ltx2( video=video, audio=audio, - output_path='ltx2.3_twostage.mp4', - fps=24, + output_path='ltx2.3_twostage_a2v.mp4', + fps=frame_rate, audio_sample_rate=pipe.audio_vocoder.output_sampling_rate, ) From 38bab47181e28d8f42649cf6879e3aab56c88a82 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 11 Mar 2026 20:07:16 +0800 Subject: [PATCH 3/5] support ltx2.3 retake video and audio --- README.md | 2 + README_zh.md | 2 + diffsynth/pipelines/ltx2_audio_video.py | 191 ++++++++++++------ docs/en/Model_Details/LTX-2.md | 2 + docs/zh/Model_Details/LTX-2.md | 2 + .../model_inference/LTX-2.3-A2V-TwoStage.py | 2 +- .../LTX-2.3-T2AV-TwoStage-Retake.py | 76 +++++++ 
.../LTX-2.3-A2V-TwoStage.py | 66 ++++++ .../LTX-2.3-T2AV-TwoStage-Retake.py | 77 +++++++ 9 files changed, 359 insertions(+), 61 deletions(-) create mode 100644 examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py create mode 100644 examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py create mode 100644 examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py diff --git a/README.md b/README.md index e28ea4bc..3db16fe3 100644 --- a/README.md +++ b/README.md @@ -711,6 +711,8 @@ Example code for LTX-2 is available at: [/examples/ltx2/](/examples/ltx2/) |[Lightricks/LTX-2.3: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV.py)| |[Lightricks/LTX-2.3: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage.py)|-|-|-|-| |[Lightricks/LTX-2.3: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-DistilledPipeline.py)|-|-|-|-| +|[Lightricks/LTX-2.3: A2V](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py)|-|-|-|-| +|[Lightricks/LTX-2.3: 
Retake](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_video`,`retake_video_regions`,`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py)|-|-|-|-| |[Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)| |[Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)| |[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py)| diff --git a/README_zh.md b/README_zh.md index f1bf7dae..fbc8aaac 100644 --- a/README_zh.md +++ b/README_zh.md @@ -711,6 +711,8 @@ LTX-2 
的示例代码位于:[/examples/ltx2/](/examples/ltx2/) |[Lightricks/LTX-2.3: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2.3-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV.py)| |[Lightricks/LTX-2.3: TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage.py)|-|-|-|-| |[Lightricks/LTX-2.3: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-DistilledPipeline.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-DistilledPipeline.py)|-|-|-|-| +|[Lightricks/LTX-2.3: A2V](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py)|-|-|-|-| +|[Lightricks/LTX-2.3: Retake](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_video`,`retake_video_regions`,`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py)|-|-|-|-| 
|[Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)| |[Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|-|-|[code](/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)| |[Lightricks/LTX-2: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|[code](/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py)|[code](/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh)|[code](/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py)| diff --git a/diffsynth/pipelines/ltx2_audio_video.py b/diffsynth/pipelines/ltx2_audio_video.py index 82a669fd..e2cd3453 100644 --- a/diffsynth/pipelines/ltx2_audio_video.py +++ b/diffsynth/pipelines/ltx2_audio_video.py @@ -58,6 +58,8 @@ def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16): LTX2AudioVideoUnit_ShapeChecker(), LTX2AudioVideoUnit_PromptEmbedder(), LTX2AudioVideoUnit_NoiseInitializer(), + LTX2AudioVideoUnit_VideoRetakeEmbedder(), + 
LTX2AudioVideoUnit_AudioRetakeEmbedder(), LTX2AudioVideoUnit_InputAudioEmbedder(), LTX2AudioVideoUnit_InputVideoEmbedder(), LTX2AudioVideoUnit_InputImagesEmbedder(), @@ -67,8 +69,9 @@ def __init__(self, device=get_device_type(), torch_dtype=torch.bfloat16): LTX2AudioVideoUnit_SwitchStage2(), LTX2AudioVideoUnit_NoiseInitializer(), LTX2AudioVideoUnit_LatentsUpsampler(), + LTX2AudioVideoUnit_VideoRetakeEmbedder(), + LTX2AudioVideoUnit_AudioRetakeEmbedder(), LTX2AudioVideoUnit_InputImagesEmbedder(), - LTX2AudioVideoUnit_InputAudioEmbedder(), LTX2AudioVideoUnit_SetScheduleStage2(), ] self.model_fn = model_fn_ltx2 @@ -156,9 +159,9 @@ def denoise_stage(self, inputs_shared, inputs_posi, inputs_nega, units, cfg_scal **models, timestep=timestep, progress_id=progress_id ) inputs_shared["video_latents"] = self.step(self.scheduler, inputs_shared["video_latents"], progress_id=progress_id, noise_pred=noise_pred_video, - inpaint_mask=inputs_shared.get("video_denoise_mask", None), input_latents=inputs_shared.get("video_input_latents", None), **inputs_shared) + inpaint_mask=inputs_shared.get("denoise_mask_video", None), input_latents=inputs_shared.get("input_latents_video", None), **inputs_shared) inputs_shared["audio_latents"] = self.step(self.scheduler, inputs_shared["audio_latents"], progress_id=progress_id, noise_pred=noise_pred_audio, - inpaint_mask=inputs_shared.get("audio_denoise_mask", None), input_latents=inputs_shared.get("audio_input_latents", None), **inputs_shared) + inpaint_mask=inputs_shared.get("denoise_mask_audio", None), input_latents=inputs_shared.get("input_latents_audio", None), **inputs_shared) return inputs_shared, inputs_posi, inputs_nega @torch.no_grad() @@ -175,9 +178,13 @@ def __call__( # In-Context Video Control in_context_videos: Optional[list[list[Image.Image]]] = None, in_context_downsample_factor: Optional[int] = 2, + # Video-to-video + retake_video: Optional[list[Image.Image]] = None, + retake_video_regions: Optional[list[tuple[float, float]]] = 
None, # Audio-to-video - input_audio: Optional[torch.Tensor] = None, + retake_audio: Optional[torch.Tensor] = None, audio_sample_rate: Optional[int] = 48000, + retake_audio_regions: Optional[list[tuple[float, float]]] = None, # Randomness seed: Optional[int] = None, rand_device: Optional[str] = "cpu", @@ -215,7 +222,8 @@ def __call__( } inputs_shared = { "input_images": input_images, "input_images_indexes": input_images_indexes, "input_images_strength": input_images_strength, - "input_audio": (input_audio, audio_sample_rate) if input_audio is not None else None, + "retake_video": retake_video, "retake_video_regions": retake_video_regions, + "retake_audio": (retake_audio, audio_sample_rate) if retake_audio is not None else None, "retake_audio_regions": retake_audio_regions, "in_context_videos": in_context_videos, "in_context_downsample_factor": in_context_downsample_factor, "seed": seed, "rand_device": rand_device, "height": height, "width": width, "num_frames": num_frames, "frame_rate": frame_rate, @@ -360,64 +368,110 @@ def __init__(self): ) def process(self, pipe: LTX2AudioVideoPipeline, input_video, video_noise, tiled, tile_size_in_pixels, tile_overlap_in_pixels): - if input_video is None: + if input_video is None or not pipe.scheduler.training: return {"video_latents": video_noise} else: pipe.load_models_to_device(self.onload_model_names) input_video = pipe.preprocess_video(input_video) input_latents = pipe.video_vae_encoder.encode(input_video, tiled, tile_size_in_pixels, tile_overlap_in_pixels).to(dtype=pipe.torch_dtype, device=pipe.device) - if pipe.scheduler.training: - # input_latents key is for training to add noise. with no prefix "video" to keep loss function keyword consistent. 
- return {"video_latents": video_noise, "input_latents": input_latents} - else: - raise NotImplementedError("Video-to-video not implemented yet.") - + return {"video_latents": input_latents, "input_latents": input_latents} class LTX2AudioVideoUnit_InputAudioEmbedder(PipelineUnit): def __init__(self): super().__init__( input_params=("input_audio", "audio_noise"), - output_params=("audio_latents", "audio_input_latents", "audio_noise", "audio_positions", "audio_latent_shape", "audio_denoise_mask"), + output_params=("audio_latents", "audio_input_latents", "audio_positions", "audio_latent_shape"), onload_model_names=("audio_vae_encoder",) ) def process(self, pipe: LTX2AudioVideoPipeline, input_audio, audio_noise): - if input_audio is None: + if input_audio is None or not pipe.scheduler.training: return {"audio_latents": audio_noise} else: input_audio, sample_rate = input_audio pipe.load_models_to_device(self.onload_model_names) - input_audio = pipe.audio_processor.waveform_to_mel(input_audio.unsqueeze(0), waveform_sample_rate=sample_rate).to(dtype=pipe.torch_dtype, device=pipe.device) + input_audio = pipe.audio_processor.waveform_to_mel(input_audio.unsqueeze(0), waveform_sample_rate=sample_rate).to(dtype=pipe.torch_dtype) audio_input_latents = pipe.audio_vae_encoder(input_audio) audio_latent_shape = AudioLatentShape.from_torch_shape(audio_input_latents.shape) audio_positions = pipe.audio_patchifier.get_patch_grid_bounds(audio_latent_shape, device=pipe.device) - if pipe.scheduler.training: - return { - "audio_latents": audio_input_latents, - "audio_input_latents": audio_input_latents, - "audio_positions": audio_positions, - "audio_latent_shape": audio_latent_shape, - } - else: - b, c, t, f = audio_input_latents.shape - audio_denoise_mask = torch.zeros((b, 1, t, 1), device=audio_input_latents.device, dtype=audio_input_latents.dtype) - audio_noise = torch.rand_like(audio_input_latents) - audio_latents = pipe.scheduler.add_noise(audio_input_latents, audio_noise, 
pipe.scheduler.timesteps[0]) - return { - "audio_latents": audio_latents, - "audio_input_latents": audio_input_latents, - "audio_noise": audio_noise, - "audio_positions": audio_positions, - "audio_latent_shape": audio_latent_shape, - "audio_denoise_mask": audio_denoise_mask, - } + return {"audio_latents": audio_input_latents, "audio_input_latents": audio_input_latents, "audio_positions": audio_positions, "audio_latent_shape": audio_latent_shape} + + +class LTX2AudioVideoUnit_VideoRetakeEmbedder(PipelineUnit): + def __init__(self): + super().__init__( + input_params=("retake_video", "height", "width", "tiled", "tile_size_in_pixels", "tile_overlap_in_pixels", "video_positions", "retake_video_regions"), + output_params=("input_latents_video", "denoise_mask_video"), + onload_model_names=("video_vae_encoder") + ) + + def process(self, pipe: LTX2AudioVideoPipeline, retake_video, height, width, tiled, tile_size_in_pixels, tile_overlap_in_pixels, video_positions, retake_video_regions=None): + if retake_video is None: + return {} + pipe.load_models_to_device(self.onload_model_names) + resized_video = [frame.resize((width, height)) for frame in retake_video] + input_video = pipe.preprocess_video(resized_video) + input_latents_video = pipe.video_vae_encoder.encode(input_video, tiled, tile_size_in_pixels, tile_overlap_in_pixels).to(dtype=pipe.torch_dtype, device=pipe.device) + + b, c, f, h, w = input_latents_video.shape + denoise_mask_video = torch.zeros((b, 1, f, h, w), device=input_latents_video.device, dtype=input_latents_video.dtype) + if retake_video_regions is not None and len(retake_video_regions) > 0: + for start_time, end_time in retake_video_regions: + t_start, t_end = video_positions[0, 0].unbind(dim=-1) + in_region = (t_end >= start_time) & (t_start <= end_time) + in_region = pipe.video_patchifier.unpatchify_video(in_region.unsqueeze(0).unsqueeze(-1), f, h, w) + denoise_mask_video = torch.where(in_region, torch.ones_like(denoise_mask_video), denoise_mask_video) + + 
return {"input_latents_video": input_latents_video, "denoise_mask_video": denoise_mask_video} + + +class LTX2AudioVideoUnit_AudioRetakeEmbedder(PipelineUnit): + """ + Functionality of audio2video, audio retaking. + """ + def __init__(self): + super().__init__( + input_params=("retake_audio", "seed", "rand_device", "retake_audio_regions"), + output_params=("input_latents_audio", "audio_noise", "audio_positions", "audio_latent_shape", "denoise_mask_audio", "audio_latents"), + onload_model_names=("audio_vae_encoder",) + ) + + def process(self, pipe: LTX2AudioVideoPipeline, retake_audio, seed, rand_device, retake_audio_regions=None): + if retake_audio is None: + return {} + else: + input_audio, sample_rate = retake_audio + pipe.load_models_to_device(self.onload_model_names) + input_audio = pipe.audio_processor.waveform_to_mel(input_audio.unsqueeze(0), waveform_sample_rate=sample_rate).to(dtype=pipe.torch_dtype, device=pipe.device) + input_latents_audio = pipe.audio_vae_encoder(input_audio) + audio_latent_shape = AudioLatentShape.from_torch_shape(input_latents_audio.shape) + audio_positions = pipe.audio_patchifier.get_patch_grid_bounds(audio_latent_shape, device=pipe.device) + # Regenerate noise for the new shape if retake_audio is provided, to avoid shape mismatch. 
+ audio_noise = pipe.generate_noise(input_latents_audio.shape, seed=seed, rand_device=rand_device) + + b, c, t, f = input_latents_audio.shape + denoise_mask_audio = torch.zeros((b, 1, t, 1), device=input_latents_audio.device, dtype=input_latents_audio.dtype) + if retake_audio_regions is not None and len(retake_audio_regions) > 0: + for start_time, end_time in retake_audio_regions: + t_start, t_end = audio_positions[:, 0, :, 0], audio_positions[:, 0, :, 1] + in_region = (t_end >= start_time) & (t_start <= end_time) + in_region = pipe.audio_patchifier.unpatchify_audio(in_region.unsqueeze(-1), 1, 1) + denoise_mask_audio = torch.where(in_region, torch.ones_like(denoise_mask_audio), denoise_mask_audio) + + return { + "input_latents_audio": input_latents_audio, + "denoise_mask_audio": denoise_mask_audio, + "audio_noise": audio_noise, + "audio_positions": audio_positions, + "audio_latent_shape": audio_latent_shape, + } class LTX2AudioVideoUnit_InputImagesEmbedder(PipelineUnit): def __init__(self): super().__init__( - input_params=("input_images", "input_images_indexes", "input_images_strength", "video_latents", "height", "width", "frame_rate", "tiled", "tile_size_in_pixels", "tile_overlap_in_pixels", "initial_latents"), - output_params=("video_denoise_mask", "video_input_latents", "ref_frames_latents", "ref_frames_positions"), + input_params=("input_images", "input_images_indexes", "input_images_strength", "video_latents", "height", "width", "frame_rate", "tiled", "tile_size_in_pixels", "tile_overlap_in_pixels", "input_latents_video", "denoise_mask_video"), + output_params=("denoise_mask_video", "input_latents_video", "ref_frames_latents", "ref_frames_positions"), onload_model_names=("video_vae_encoder") ) @@ -429,31 +483,47 @@ def get_image_latent(self, pipe, input_image, height, width, tiled, tile_size_in latents = pipe.video_vae_encoder.encode(image, tiled, tile_size_in_pixels, tile_overlap_in_pixels).to(pipe.device) return latents - def 
apply_input_images_to_latents(self, latents, input_latents, input_indexes, input_strength=1.0, initial_latents=None, video_denoise_mask=None): + def apply_input_images_to_latents(self, latents, input_latents, input_indexes, input_strength=1.0, input_latents_video=None, denoise_mask_video=None): b, _, f, h, w = latents.shape - denoise_mask = torch.ones((b, 1, f, h, w), dtype=latents.dtype, device=latents.device) if video_denoise_mask is None else video_denoise_mask - initial_latents = torch.zeros_like(latents) if initial_latents is None else initial_latents + denoise_mask = torch.ones((b, 1, f, h, w), dtype=latents.dtype, device=latents.device) if denoise_mask_video is None else denoise_mask_video + input_latents_video = torch.zeros_like(latents) if input_latents_video is None else input_latents_video for idx, input_latent in zip(input_indexes, input_latents): idx = min(max(1 + (idx-1) // 8, 0), f - 1) input_latent = input_latent.to(dtype=latents.dtype, device=latents.device) - initial_latents[:, :, idx:idx + input_latent.shape[2], :, :] = input_latent + input_latents_video[:, :, idx:idx + input_latent.shape[2], :, :] = input_latent denoise_mask[:, :, idx:idx + input_latent.shape[2], :, :] = 1.0 - input_strength - return initial_latents, denoise_mask + return input_latents_video, denoise_mask - def process(self, pipe: LTX2AudioVideoPipeline, video_latents, input_images, height, width, frame_rate, tiled, tile_size_in_pixels, tile_overlap_in_pixels, input_images_indexes=[0], input_images_strength=1.0, initial_latents=None): + def process( + self, + pipe: LTX2AudioVideoPipeline, + video_latents, + input_images, + height, + width, + frame_rate, + tiled, + tile_size_in_pixels, + tile_overlap_in_pixels, + input_images_indexes=[0], + input_images_strength=1.0, + input_latents_video=None, + denoise_mask_video=None, + ): if input_images is None or len(input_images) == 0: return {} else: if len(input_images_indexes) != len(set(input_images_indexes)): raise ValueError("Input 
images must have unique indexes.") pipe.load_models_to_device(self.onload_model_names) - frame_conditions = {"video_input_latents": None, "video_denoise_mask": None, "ref_frames_latents": [], "ref_frames_positions": []} + frame_conditions = {"input_latents_video": None, "denoise_mask_video": None, "ref_frames_latents": [], "ref_frames_positions": []} for img, index in zip(input_images, input_images_indexes): latents = self.get_image_latent(pipe, img, height, width, tiled, tile_size_in_pixels, tile_overlap_in_pixels) # first_frame by replacing latents if index == 0: - video_input_latents, video_denoise_mask = self.apply_input_images_to_latents(video_latents, [latents], [0], input_images_strength, initial_latents) - frame_conditions.update({"video_input_latents": video_input_latents, "video_denoise_mask": video_denoise_mask}) + input_latents_video, denoise_mask_video = self.apply_input_images_to_latents( + video_latents, [latents], [0], input_images_strength, input_latents_video, denoise_mask_video) + frame_conditions.update({"input_latents_video": input_latents_video, "denoise_mask_video": denoise_mask_video}) # other frames by adding reference latents else: latent_coords = pipe.video_patchifier.get_patch_grid_bounds(output_shape=VideoLatentShape.from_torch_shape(latents.shape), device=pipe.device) @@ -531,6 +601,7 @@ def process(self, pipe: LTX2AudioVideoPipeline, stage_2_height, stage_2_width, c stage2_params = {} stage2_params.update({"height": stage_2_height, "width": stage_2_width}) stage2_params.update({"in_context_video_latents": None, "in_context_video_positions": None}) + stage2_params.update({"input_latents_video": None, "denoise_mask_video": None}) if clear_lora_before_state_two: pipe.clear_lora() if not use_distilled_pipeline: @@ -556,7 +627,7 @@ class LTX2AudioVideoUnit_LatentsUpsampler(PipelineUnit): def __init__(self): super().__init__( input_params=("video_latents",), - output_params=("video_latents", "initial_latents"), + 
output_params=("video_latents",), onload_model_names=("upsampler",), ) @@ -568,7 +639,7 @@ def process(self, pipe: LTX2AudioVideoPipeline, video_latents): video_latents = pipe.video_vae_encoder.per_channel_statistics.un_normalize(video_latents) video_latents = pipe.upsampler(video_latents) video_latents = pipe.video_vae_encoder.per_channel_statistics.normalize(video_latents) - return {"video_latents": video_latents, "initial_latents": video_latents} + return {"video_latents": video_latents} def model_fn_ltx2( @@ -583,8 +654,8 @@ def model_fn_ltx2( audio_patchifier=None, timestep=None, # First Frame Conditioning - video_input_latents=None, - video_denoise_mask=None, + input_latents_video=None, + denoise_mask_video=None, # Other Frames Conditioning ref_frames_latents=None, ref_frames_positions=None, @@ -592,8 +663,8 @@ def model_fn_ltx2( in_context_video_latents=None, in_context_video_positions=None, # Audio Inputs - audio_input_latents=None, - audio_denoise_mask=None, + input_latents_audio=None, + denoise_mask_audio=None, # Gradient Checkpointing use_gradient_checkpointing=False, use_gradient_checkpointing_offload=False, @@ -607,10 +678,10 @@ def model_fn_ltx2( seq_len_video = video_latents.shape[1] video_timesteps = timestep.repeat(1, video_latents.shape[1], 1) # Frist frame conditioning by replacing the video latents - if video_input_latents is not None: - video_denoise_mask = video_patchifier.patchify(video_denoise_mask) - video_latents = video_latents * video_denoise_mask + video_patchifier.patchify(video_input_latents) * (1.0 - video_denoise_mask) - video_timesteps = video_denoise_mask * video_timesteps + if input_latents_video is not None: + denoise_mask_video = video_patchifier.patchify(denoise_mask_video) + video_latents = video_latents * denoise_mask_video + video_patchifier.patchify(input_latents_video) * (1.0 - denoise_mask_video) + video_timesteps = denoise_mask_video * video_timesteps # Reference conditioning by appending the reference video or frame 
latents total_ref_latents = ref_frames_latents if ref_frames_latents is not None else [] @@ -631,10 +702,10 @@ def model_fn_ltx2( audio_timesteps = timestep.repeat(1, audio_latents.shape[1], 1) else: audio_timesteps = None - if audio_input_latents is not None: - audio_denoise_mask = audio_patchifier.patchify(audio_denoise_mask) - audio_latents = audio_latents * audio_denoise_mask + audio_patchifier.patchify(audio_input_latents) * (1.0 - audio_denoise_mask) - audio_timesteps = audio_denoise_mask * audio_timesteps + if input_latents_audio is not None: + denoise_mask_audio = audio_patchifier.patchify(denoise_mask_audio) + audio_latents = audio_latents * denoise_mask_audio + audio_patchifier.patchify(input_latents_audio) * (1.0 - denoise_mask_audio) + audio_timesteps = denoise_mask_audio * audio_timesteps vx, ax = dit( video_latents=video_latents, diff --git a/docs/en/Model_Details/LTX-2.md b/docs/en/Model_Details/LTX-2.md index db9937a2..18047f9a 100644 --- a/docs/en/Model_Details/LTX-2.md +++ b/docs/en/Model_Details/LTX-2.md @@ -117,6 +117,8 @@ write_video_audio_ltx2( |[Lightricks/LTX-2.3: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-OneStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-OneStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/full/LTX-2.3-T2AV-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_full/LTX-2.3-T2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/lora/LTX-2.3-T2AV-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV.py)| |[Lightricks/LTX-2.3: 
TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage.py)|-|-|-|-| |[Lightricks/LTX-2.3: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-DistilledPipeline.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-DistilledPipeline.py)|-|-|-|-| +|[Lightricks/LTX-2.3: A2V](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py)|-|-|-|-| +|[Lightricks/LTX-2.3: Retake](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_video`,`retake_video_regions`,`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py)|-|-|-|-| 
|[Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)| |[Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|-|-|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)| |[Lightricks/LTX-2: 
OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py)| diff --git a/docs/zh/Model_Details/LTX-2.md b/docs/zh/Model_Details/LTX-2.md index d1c54961..e2592ddf 100644 --- a/docs/zh/Model_Details/LTX-2.md +++ b/docs/zh/Model_Details/LTX-2.md @@ -117,6 +117,8 @@ write_video_audio_ltx2( |[Lightricks/LTX-2.3: OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-OneStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-OneStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/full/LTX-2.3-T2AV-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_full/LTX-2.3-T2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/lora/LTX-2.3-T2AV-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV.py)| |[Lightricks/LTX-2.3: 
TwoStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage.py)|-|-|-|-| |[Lightricks/LTX-2.3: DistilledPipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2.3)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-DistilledPipeline.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-DistilledPipeline.py)|-|-|-|-| +|[Lightricks/LTX-2.3: A2V](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py)|-|-|-|-| +|[Lightricks/LTX-2.3: Retake](https://www.modelscope.cn/models/Lightricks/LTX-2.3)|`retake_video`,`retake_video_regions`,`retake_audio`,`audio_sample_rate`,`retake_audio_regions`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py)|-|-|-|-| 
|[Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Union-Control.py)|-|-|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)| |[Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control](https://www.modelscope.cn/models/Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control)|`in_context_videos`,`in_context_downsample_factor`|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-IC-LoRA-Motion-Track-Control.py)|-|-|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/lora/LTX-2.3-T2AV-IC-LoRA-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_lora/LTX-2.3-T2AV-IC-LoRA.py)| |[Lightricks/LTX-2: 
OneStagePipeline-T2AV](https://www.modelscope.cn/models/Lightricks/LTX-2)||[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference/LTX-2-T2AV-OneStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_inference_low_vram/LTX-2-T2AV-OneStage.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/full/LTX-2-T2AV-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_full/LTX-2-T2AV.py)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/lora/LTX-2-T2AV-splited.sh)|[code](https://github.com/modelscope/DiffSynth-Studio/blob/main/examples/ltx2/model_training/validate_lora/LTX-2-T2AV.py)| diff --git a/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py b/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py index d42dade1..62b88782 100644 --- a/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py +++ b/examples/ltx2/model_inference/LTX-2.3-A2V-TwoStage.py @@ -46,7 +46,7 @@ video, audio = pipe( prompt=prompt, negative_prompt=negative_prompt, - input_audio=audio, + retake_audio=audio, audio_sample_rate=audio_sample_rate, seed=43, height=height, diff --git a/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py b/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py new file mode 100644 index 00000000..e717fe9b --- /dev/null +++ b/examples/ltx2/model_inference/LTX-2.3-T2AV-TwoStage-Retake.py @@ -0,0 +1,76 @@ +import torch +from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig +from diffsynth.utils.data.media_io_ltx2 import read_audio_with_torchaudio, write_video_audio_ltx2 +from modelscope import dataset_snapshot_download +from diffsynth.utils.data import VideoData + +vram_config = { + "offload_dtype": torch.bfloat16, + "offload_device": "cpu", + "onload_dtype": torch.bfloat16, + "onload_device": "cuda", + 
"preparing_dtype": torch.bfloat16, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = LTX2AudioVideoPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), + stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-distilled-lora-384.safetensors"), +) + +dataset_snapshot_download("DiffSynth-Studio/example_video_dataset", allow_file_pattern="ltx2/*", local_dir="data/example_video_dataset") +prompt = "A beautiful woman with a flower crown is singing happily under a blooming cherry tree. She sings: 'Mummy don't know daddy's getting hot. 
At the body shop'" +negative_prompt = ( + "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " + "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, " + "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, " + "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of " + "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent " + "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny " + "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, " + "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, " + "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward " + "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, " + "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." +) + +height, width, num_frames, frame_rate = 512 * 2, 768 * 2, 121, 24 +path = "data/example_video_dataset/ltx2/video2.mp4" +video = VideoData(path, height=height, width=width).raw_data()[:num_frames] +assert len(video) == num_frames, f"Input video has {len(video)} frames, but expected {num_frames} frames based on the specified num_frames argument." +duration = num_frames / frame_rate +audio, audio_sample_rate = read_audio_with_torchaudio(path) + +# Regenerate the video within time regions. You can specify different time regions for video frames and audio retake. +# retake regions are in seconds, and the example below retakes video frames in the time regions of [1s, 2s] and [3s, 4s], and retakes audio in the time regions of [0s, 1s] and [4s, 5s]. 
+video, audio = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + retake_video=video, + retake_video_regions=[(1, 2), (3, 4)], + retake_audio=audio, + audio_sample_rate=audio_sample_rate, + retake_audio_regions=[(0, 1), (4, 5)], + seed=43, + height=height, + width=width, + num_frames=num_frames, + frame_rate=frame_rate, + tiled=True, + use_two_stage_pipeline=True, +) +write_video_audio_ltx2( + video=video, + audio=audio, + output_path='ltx2.3_twostage_retake.mp4', + fps=frame_rate, + audio_sample_rate=pipe.audio_vocoder.output_sampling_rate, +) diff --git a/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py b/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py new file mode 100644 index 00000000..267a3059 --- /dev/null +++ b/examples/ltx2/model_inference_low_vram/LTX-2.3-A2V-TwoStage.py @@ -0,0 +1,66 @@ +import torch +from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig +from diffsynth.utils.data.media_io_ltx2 import read_audio_with_torchaudio, write_video_audio_ltx2 +from modelscope import dataset_snapshot_download + +vram_config = { + "offload_dtype": torch.float8_e5m2, + "offload_device": "cpu", + "onload_dtype": torch.float8_e5m2, + "onload_device": "cpu", + "preparing_dtype": torch.float8_e5m2, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = LTX2AudioVideoPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), + 
stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-distilled-lora-384.safetensors"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +dataset_snapshot_download("DiffSynth-Studio/example_video_dataset", allow_file_pattern="ltx2/*", local_dir="data/example_video_dataset") +prompt = "A beautiful woman with a flower crown is singing happily under a blooming cherry tree." +negative_prompt = ( + "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " + "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, " + "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, " + "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of " + "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent " + "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny " + "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, " + "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, " + "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward " + "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, " + "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." 
+) +height, width, num_frames, frame_rate = 512 * 2, 768 * 2, 121, 24 +duration = num_frames / frame_rate +audio, audio_sample_rate = read_audio_with_torchaudio("data/example_video_dataset/ltx2/sing.MP3", start_time=1, duration=duration) +video, audio = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + retake_audio=audio, + audio_sample_rate=audio_sample_rate, + seed=43, + height=height, + width=width, + num_frames=num_frames, + frame_rate=frame_rate, + tiled=True, + use_two_stage_pipeline=True, +) +write_video_audio_ltx2( + video=video, + audio=audio, + output_path='ltx2.3_twostage_a2v.mp4', + fps=frame_rate, + audio_sample_rate=pipe.audio_vocoder.output_sampling_rate, +) diff --git a/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py b/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py new file mode 100644 index 00000000..7eb1e7eb --- /dev/null +++ b/examples/ltx2/model_inference_low_vram/LTX-2.3-T2AV-TwoStage-Retake.py @@ -0,0 +1,77 @@ +import torch +from diffsynth.pipelines.ltx2_audio_video import LTX2AudioVideoPipeline, ModelConfig +from diffsynth.utils.data.media_io_ltx2 import read_audio_with_torchaudio, write_video_audio_ltx2 +from modelscope import dataset_snapshot_download +from diffsynth.utils.data import VideoData + +vram_config = { + "offload_dtype": torch.float8_e5m2, + "offload_device": "cpu", + "onload_dtype": torch.float8_e5m2, + "onload_device": "cpu", + "preparing_dtype": torch.float8_e5m2, + "preparing_device": "cuda", + "computation_dtype": torch.bfloat16, + "computation_device": "cuda", +} +pipe = LTX2AudioVideoPipeline.from_pretrained( + torch_dtype=torch.bfloat16, + device="cuda", + model_configs=[ + ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors", **vram_config), + ModelConfig(model_id="Lightricks/LTX-2.3", 
origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors", **vram_config), + ], + tokenizer_config=ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized"), + stage2_lora_config=ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-distilled-lora-384.safetensors"), + vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5, +) + +dataset_snapshot_download("DiffSynth-Studio/example_video_dataset", allow_file_pattern="ltx2/*", local_dir="data/example_video_dataset") +prompt = "A beautiful woman with a flower crown is singing happily under a blooming cherry tree. She sings: 'Mummy don't know daddy's getting hot. At the body shop'" +negative_prompt = ( + "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, " + "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, " + "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, " + "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of " + "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent " + "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny " + "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, " + "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, " + "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward " + "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, " + "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts." 
+) + +height, width, num_frames, frame_rate = 512 * 2, 768 * 2, 121, 24 +path = "data/example_video_dataset/ltx2/video2.mp4" +video = VideoData(path, height=height, width=width).raw_data()[:num_frames] +assert len(video) == num_frames, f"Input video has {len(video)} frames, but expected {num_frames} frames based on the specified num_frames argument." +duration = num_frames / frame_rate +audio, audio_sample_rate = read_audio_with_torchaudio(path) + +# Regenerate the video within time regions. You can specify different time regions for video frames and audio retake. +# retake regions are in seconds, and the example below retakes video frames in the time regions of [1s, 2s] and [3s, 4s], and retakes audio in the time regions of [0s, 1s] and [4s, 5s]. +video, audio = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + retake_video=video, + retake_video_regions=[(1, 2), (3, 4)], + retake_audio=audio, + audio_sample_rate=audio_sample_rate, + retake_audio_regions=[(0, 1), (4, 5)], + seed=43, + height=height, + width=width, + num_frames=num_frames, + frame_rate=frame_rate, + tiled=True, + use_two_stage_pipeline=True, +) +write_video_audio_ltx2( + video=video, + audio=audio, + output_path='ltx2.3_twostage_retake.mp4', + fps=frame_rate, + audio_sample_rate=pipe.audio_vocoder.output_sampling_rate, +) From c33334968a487e2014219bad22c2846439d9a1b2 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Wed, 11 Mar 2026 20:30:37 +0800 Subject: [PATCH 4/5] add news --- README.md | 2 ++ README_zh.md | 1 + diffsynth/pipelines/ltx2_audio_video.py | 2 +- diffsynth/utils/data/media_io_ltx2.py | 9 ++++++++- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3db16fe3..66dcefab 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ We believe that a well-developed open-source code framework can lower the thresh > Currently, the development personnel of this project are limited, with most of the work handled by 
[Artiprocher](https://github.com/Artiprocher). Therefore, the progress of new feature development will be relatively slow, and the speed of responding to and resolving issues is limited. We apologize for this and ask developers to understand. +- **March 12, 2026**: We have added support for the [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) audio-video generation model. The features include text-to-audio/video, image-to-audio/video, IC-LoRA control, audio-to-video, and audio-video inpainting. Complete inference and training functionality is supported. For details, please refer to the [documentation](/docs/zh/Model_Details/LTX-2.md) and [code](/examples/ltx2/). + - **March 3, 2026**: We released the [DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2) model, which is an updated version of Qwen-Image-Layered-Control. In addition to the originally supported text-guided functionality, it adds brush-controlled layer separation capabilities. - **March 2, 2026** Added support for [Anima](https://modelscope.cn/models/circlestone-labs/Anima). For details, please refer to the [documentation](docs/en/Model_Details/Anima.md). This is an interesting anime-style image generation model. We look forward to its future updates. 
diff --git a/README_zh.md b/README_zh.md index fbc8aaac..c7feb85f 100644 --- a/README_zh.md +++ b/README_zh.md @@ -33,6 +33,7 @@ DiffSynth 目前包括两个开源项目: > 目前本项目的开发人员有限,大部分工作由 [Artiprocher](https://github.com/Artiprocher) 负责,因此新功能的开发进展会比较缓慢,issue 的回复和解决速度有限,我们对此感到非常抱歉,请各位开发者理解。 +- **2026年3月12日** 我们新增了 [LTX-2.3](https://modelscope.cn/models/Lightricks/LTX-2.3) 音视频生成模型的支持,模型支持的功能包括文生音视频、图生音视频、IC-LoRA控制、音频生视频、音视频局部Inpainting,框架支持完整的推理和训练功能。详细信息请参考 [文档](/docs/zh/Model_Details/LTX-2.md) 和 [示例代码](/examples/ltx2/)。 - **2026年3月3日** 我们发布了 [DiffSynth-Studio/Qwen-Image-Layered-Control-V2](https://www.modelscope.cn/models/DiffSynth-Studio/Qwen-Image-Layered-Control-V2) 模型,这是 Qwen-Image-Layered-Control 的更新版本。除了原本就支持的文本引导功能,新增了画笔控制的图层拆分能力。 - **2026年3月2日** 新增对[Anima](https://modelscope.cn/models/circlestone-labs/Anima)的支持,详见[文档](docs/zh/Model_Details/Anima.md)。这是一个有趣的动漫风格图像生成模型,我们期待其后续的模型更新。 diff --git a/diffsynth/pipelines/ltx2_audio_video.py b/diffsynth/pipelines/ltx2_audio_video.py index e2cd3453..a06213a6 100644 --- a/diffsynth/pipelines/ltx2_audio_video.py +++ b/diffsynth/pipelines/ltx2_audio_video.py @@ -432,7 +432,7 @@ class LTX2AudioVideoUnit_AudioRetakeEmbedder(PipelineUnit): def __init__(self): super().__init__( input_params=("retake_audio", "seed", "rand_device", "retake_audio_regions"), - output_params=("input_latents_audio", "audio_noise", "audio_positions", "audio_latent_shape", "denoise_mask_audio", "audio_latents"), + output_params=("input_latents_audio", "audio_noise", "audio_positions", "audio_latent_shape", "denoise_mask_audio"), onload_model_names=("audio_vae_encoder",) ) diff --git a/diffsynth/utils/data/media_io_ltx2.py b/diffsynth/utils/data/media_io_ltx2.py index 90aed07d..31f2d6ad 100644 --- a/diffsynth/utils/data/media_io_ltx2.py +++ b/diffsynth/utils/data/media_io_ltx2.py @@ -78,6 +78,7 @@ def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: audio_stream.codec_context.time_base = Fraction(1, best_rate) return audio_stream + def 
write_video_audio_ltx2( video: list[Image.Image], audio: torch.Tensor | None, @@ -144,7 +145,13 @@ def resample_waveform(waveform: torch.Tensor, source_rate: int, target_rate: int return resampled.to(dtype=waveform.dtype) -def read_audio_with_torchaudio(path: str, start_time: float = 0, duration: float | None = None, resample: bool = False, resample_rate: int = 48000) -> torch.Tensor: +def read_audio_with_torchaudio( + path: str, + start_time: float = 0, + duration: float | None = None, + resample: bool = False, + resample_rate: int = 48000, +) -> tuple[torch.Tensor, int]: waveform, sample_rate = torchaudio.load(path, channels_first=True) if resample: waveform = resample_waveform(waveform, sample_rate, resample_rate) From 13562354ba809aa1cfbfcbbfac8772a0d7cb4703 Mon Sep 17 00:00:00 2001 From: mi804 <1576993271@qq.com> Date: Thu, 12 Mar 2026 10:19:42 +0800 Subject: [PATCH 5/5] minor fix --- diffsynth/utils/data/media_io_ltx2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffsynth/utils/data/media_io_ltx2.py b/diffsynth/utils/data/media_io_ltx2.py index 31f2d6ad..c31b0507 100644 --- a/diffsynth/utils/data/media_io_ltx2.py +++ b/diffsynth/utils/data/media_io_ltx2.py @@ -159,7 +159,7 @@ def read_audio_with_torchaudio( start_frame = int(start_time * sample_rate) if start_frame > waveform.shape[-1]: raise ValueError(f"start_time of {start_time} exceeds max duration of {waveform.shape[-1] / sample_rate:.2f}") - end_frame = -1 if duration is None else int(duration * sample_rate + start_frame) + end_frame = None if duration is None else int(duration * sample_rate + start_frame) return waveform[..., start_frame:end_frame], sample_rate