Files changed (1) hide show
  1. app.py +81 -13
app.py CHANGED
@@ -15,6 +15,8 @@ from tqdm import tqdm
15
  import cv2
16
  import numpy as np
17
  import torch
 
 
18
  from torch.nn import functional as F
19
  from PIL import Image
20
 
@@ -231,9 +233,30 @@ def interpolate_bits(frames_np, multiplier=2, scale=1.0):
231
 
232
  # WAN
233
 
234
- MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
 
 
 
 
235
  CACHE_DIR = os.path.expanduser("~/.cache/huggingface/")
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  MAX_DIM = 832
238
  MIN_DIM = 480
239
  SQUARE_DIM = 640
@@ -258,11 +281,43 @@ SCHEDULER_MAP = {
258
  }
259
 
260
  pipe = WanImageToVideoPipeline.from_pretrained(
261
- "TestOrganizationPleaseIgnore/WAMU_v2_WAN2.2_I2V_LIGHTNING",
262
  torch_dtype=torch.bfloat16,
263
  ).to('cuda')
264
  original_scheduler = copy.deepcopy(pipe.scheduler)
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  if os.path.exists(CACHE_DIR):
267
  shutil.rmtree(CACHE_DIR)
268
  print("Deleted Hugging Face cache.")
@@ -270,8 +325,11 @@ else:
270
  print("No hub cache found.")
271
 
272
  quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
 
273
  quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
 
274
  quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
 
275
 
276
  aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
277
  aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
@@ -283,10 +341,13 @@ default_prompt_i2v = "make this image come alive, cinematic motion, smooth anima
283
  default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
284
 
285
 
 
 
 
 
 
 
286
  def resize_image(image: Image.Image) -> Image.Image:
287
- """
288
- Resizes an image to fit within the model's constraints, preserving aspect ratio as much as possible.
289
- """
290
  width, height = image.size
291
  if width == height:
292
  return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)
@@ -322,7 +383,6 @@ def resize_image(image: Image.Image) -> Image.Image:
322
 
323
 
324
  def resize_and_crop_to_match(target_image, reference_image):
325
- """Resizes and center-crops the target image to match the reference image's dimensions."""
326
  ref_width, ref_height = reference_image.size
327
  target_width, target_height = target_image.size
328
  scale = max(ref_width / target_width, ref_height / target_height)
@@ -406,7 +466,7 @@ def run_inference(
406
  clear_vram()
407
 
408
  task_name = str(uuid.uuid4())[:8]
409
- print(f"Task: {task_name}, {duration_seconds}, {resized_image.size}, FM={frame_multiplier}")
410
  start = time.time()
411
  result = pipe(
412
  image=resized_image,
@@ -422,6 +482,7 @@ def run_inference(
422
  generator=torch.Generator(device="cuda").manual_seed(current_seed),
423
  output_type="np"
424
  )
 
425
 
426
  raw_frames_np = result.frames[0] # Returns (T, H, W, C) float32
427
  pipe.scheduler = original_scheduler
@@ -429,9 +490,11 @@ def run_inference(
429
  frame_factor = frame_multiplier // FIXED_FPS
430
  if frame_factor > 1:
431
  start = time.time()
 
432
  rife_model.device()
433
  rife_model.flownet = rife_model.flownet.half()
434
  final_frames = interpolate_bits(raw_frames_np, multiplier=int(frame_factor))
 
435
  else:
436
  final_frames = list(raw_frames_np)
437
 
@@ -445,6 +508,7 @@ def run_inference(
445
  pbar.update(2)
446
  export_to_video(final_frames, video_path, fps=final_fps, quality=quality)
447
  pbar.update(1)
 
448
 
449
  return video_path, task_name
450
 
@@ -559,10 +623,9 @@ CSS = """
559
  """
560
 
561
 
562
- with gr.Blocks(theme=gr.themes.Soft(), css=CSS, delete_cache=(3600, 10800)) as demo:
563
- gr.Markdown("## WAMU V2 - Wan 2.2 I2V (14B) 🐢🐢")
564
  gr.Markdown("#### ℹ️ **A Note on Performance:** This version prioritizes a straightforward setup over maximum speed, so performance may vary.")
565
- gr.Markdown('Try the previous version: [WAMU v1](https://huggingface.co/spaces/r3gm/wan2-2-fp8da-aoti-preview2)')
566
  gr.Markdown("Run Wan 2.2 in just 4-8 steps, fp8 quantization & AoT compilation - compatible with 🧨 diffusers and ZeroGPU")
567
 
568
  with gr.Row():
@@ -571,7 +634,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CSS, delete_cache=(3600, 10800)) as d
571
  prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
572
  duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
573
  frame_multi = gr.Dropdown(
574
- choices=[FIXED_FPS, FIXED_FPS*2, FIXED_FPS*4],
575
  value=FIXED_FPS,
576
  label="Video Fluidity (Frames per Second)",
577
  info="Extra frames will be generated using flow estimation, which estimates motion between frames to make the video smoother."
@@ -593,12 +656,17 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CSS, delete_cache=(3600, 10800)) as d
593
  )
594
  flow_shift_slider = gr.Slider(minimum=0.5, maximum=15.0, step=0.1, value=3.0, label="Flow Shift")
595
  play_result_video = gr.Checkbox(label="Display result", value=True, interactive=True)
 
 
 
 
 
596
 
597
  generate_button = gr.Button("Generate Video", variant="primary")
598
 
599
  with gr.Column():
600
  # ASSIGNED elem_id="generated-video" so JS can find it
601
- video_output = gr.Video(label="Generated Video", autoplay=True, sources=["upload"], show_download_button=True, show_share_button=True, interactive=False, elem_id="generated-video")
602
 
603
  # --- Frame Grabbing UI ---
604
  with gr.Row():
@@ -641,6 +709,6 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CSS, delete_cache=(3600, 10800)) as d
641
  if __name__ == "__main__":
642
  demo.queue().launch(
643
  mcp_server=True,
644
- ssr_mode=False,
645
  show_error=True,
646
  )
 
15
  import cv2
16
  import numpy as np
17
  import torch
18
+ import torch._dynamo
19
+ from huggingface_hub import list_models
20
  from torch.nn import functional as F
21
  from PIL import Image
22
 
 
233
 
234
  # WAN
235
 
236
# WAN model selection: an explicit REPO_ID env var wins; otherwise pick a
# random WanImageToVideoPipeline checkpoint published under ORG_NAME.
ORG_NAME = "TestOrganizationPleaseIgnore"
# Upstream base model, for reference: "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
MODEL_ID = os.getenv("REPO_ID")
if not MODEL_ID:
    _candidates = list(list_models(author=ORG_NAME, filter='diffusers:WanImageToVideoPipeline'))
    if not _candidates:
        # random.choice on an empty list raises a bare IndexError;
        # fail with an actionable message instead.
        raise RuntimeError(f"No WanImageToVideoPipeline models found for organization {ORG_NAME!r}")
    # ModelInfo.id supersedes the deprecated .modelId attribute.
    MODEL_ID = random.choice(_candidates).id
241
  CACHE_DIR = os.path.expanduser("~/.cache/huggingface/")
242
 
243
+ LORA_MODELS = [
244
+ # {
245
+ # "repo_id": "exampleuser/example_lora_1",
246
+ # "high_tr": "example_lora_1_high.safetensors",
247
+ # "low_tr": "example_lora_1_low.safetensors",
248
+ # "high_scale": 0.5,
249
+ # "low_scale": 0.5
250
+ # },
251
+ # {
252
+ # "repo_id": "exampleuser/example_lora_2",
253
+ # "high_tr": "subfolder/example_lora_2_high.safetensors",
254
+ # "low_tr": "subfolder/example_lora_2_low.safetensors",
255
+ # "high_scale": 0.4,
256
+ # "low_scale": 0.4
257
+ # },
258
+ ]
259
+
260
  MAX_DIM = 832
261
  MIN_DIM = 480
262
  SQUARE_DIM = 640
 
281
  }
282
 
283
  pipe = WanImageToVideoPipeline.from_pretrained(
284
+ MODEL_ID,
285
  torch_dtype=torch.bfloat16,
286
  ).to('cuda')
287
  original_scheduler = copy.deepcopy(pipe.scheduler)
288
 
289
# Fuse optional LoRA adapters into the two WAN transformers before
# quantization. Each LORA_MODELS entry names a Hub repo plus separate
# high-noise / low-noise weight files and per-stage fuse scales.
for i, lora in enumerate(LORA_MODELS):
    # Adapter names come from the weight-file stem; the Hh/Ll suffixes keep
    # the high- and low-noise adapters distinct. Take the basename FIRST and
    # strip only the final extension, so dotted filenames such as
    # "name.v2_high.safetensors" keep their full stem ("name.v2_high")
    # instead of being truncated at the first dot (collision risk).
    name_high_tr = os.path.splitext(os.path.basename(lora["high_tr"]))[0] + "Hh"
    name_low_tr = os.path.splitext(os.path.basename(lora["low_tr"]))[0] + "Ll"

    try:
        # High-noise weights go into the primary transformer.
        pipe.load_lora_weights(
            lora["repo_id"],
            weight_name=lora["high_tr"],
            adapter_name=name_high_tr,
        )

        # Low-noise weights target the second-stage transformer_2.
        pipe.load_lora_weights(
            lora["repo_id"],
            weight_name=lora["low_tr"],
            adapter_name=name_low_tr,
            load_into_transformer_2=True,
        )

        pipe.set_adapters([name_high_tr, name_low_tr], adapter_weights=[1.0, 1.0])

        # Bake each adapter into its transformer at the configured scale,
        # then drop the LoRA bookkeeping so quantization sees plain weights.
        pipe.fuse_lora(adapter_names=[name_high_tr], lora_scale=lora["high_scale"], components=["transformer"])
        pipe.fuse_lora(adapter_names=[name_low_tr], lora_scale=lora["low_scale"], components=["transformer_2"])

        pipe.unload_lora_weights()

        print(f"Applied: {lora['high_tr']}, hs={lora['high_scale']}/ls={lora['low_scale']}, {i+1}/{len(LORA_MODELS)}")
    except Exception as e:
        # Best-effort: a bad LoRA must not take down the Space at startup;
        # unload whatever was partially loaded and continue.
        print("Error:", str(e))
        print("Failed LoRA:", name_high_tr)
        pipe.unload_lora_weights()
320
+
321
  if os.path.exists(CACHE_DIR):
322
  shutil.rmtree(CACHE_DIR)
323
  print("Deleted Hugging Face cache.")
 
325
  print("No hub cache found.")
326
 
327
  quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
328
+ torch._dynamo.reset()
329
  quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
330
+ torch._dynamo.reset()
331
  quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
332
+ torch._dynamo.reset()
333
 
334
  aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
335
  aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
 
341
  default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
342
 
343
 
344
def model_title(model_id=None):
    """Build the Markdown heading that links to the model this Space runs.

    Args:
        model_id: Hugging Face repo id ("org/name"). Defaults to the
            module-level MODEL_ID when omitted (backward compatible).

    Returns:
        A level-2 Markdown heading with a link to the model page; underscores
        in the repo name are shown as spaces.
    """
    if model_id is None:
        model_id = MODEL_ID
    repo_name = model_id.split('/')[-1].replace("_", " ")
    url = f"https://huggingface.co/{model_id}"
    return f"## This space is currently running [{repo_name}]({url}) 🐢"
348
+
349
+
350
  def resize_image(image: Image.Image) -> Image.Image:
 
 
 
351
  width, height = image.size
352
  if width == height:
353
  return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)
 
383
 
384
 
385
  def resize_and_crop_to_match(target_image, reference_image):
 
386
  ref_width, ref_height = reference_image.size
387
  target_width, target_height = target_image.size
388
  scale = max(ref_width / target_width, ref_height / target_height)
 
466
  clear_vram()
467
 
468
  task_name = str(uuid.uuid4())[:8]
469
+ print(f"Generating {num_frames} frames, task: {task_name}, {duration_seconds}, {resized_image.size}")
470
  start = time.time()
471
  result = pipe(
472
  image=resized_image,
 
482
  generator=torch.Generator(device="cuda").manual_seed(current_seed),
483
  output_type="np"
484
  )
485
+ print("gen time passed:", time.time() - start)
486
 
487
  raw_frames_np = result.frames[0] # Returns (T, H, W, C) float32
488
  pipe.scheduler = original_scheduler
 
490
  frame_factor = frame_multiplier // FIXED_FPS
491
  if frame_factor > 1:
492
  start = time.time()
493
+ print(f"Processing frames (RIFE Multiplier: {frame_factor}x)...")
494
  rife_model.device()
495
  rife_model.flownet = rife_model.flownet.half()
496
  final_frames = interpolate_bits(raw_frames_np, multiplier=int(frame_factor))
497
+ print("Interpolation time passed:", time.time() - start)
498
  else:
499
  final_frames = list(raw_frames_np)
500
 
 
508
  pbar.update(2)
509
  export_to_video(final_frames, video_path, fps=final_fps, quality=quality)
510
  pbar.update(1)
511
+ print(f"Export time passed, {final_fps} FPS:", time.time() - start)
512
 
513
  return video_path, task_name
514
 
 
623
  """
624
 
625
 
626
+ with gr.Blocks(delete_cache=(3600, 10800)) as demo:
627
+ gr.Markdown(model_title())
628
  gr.Markdown("#### ℹ️ **A Note on Performance:** This version prioritizes a straightforward setup over maximum speed, so performance may vary.")
 
629
  gr.Markdown("Run Wan 2.2 in just 4-8 steps, fp8 quantization & AoT compilation - compatible with 🧨 diffusers and ZeroGPU")
630
 
631
  with gr.Row():
 
634
  prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
635
  duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
636
  frame_multi = gr.Dropdown(
637
+ choices=[FIXED_FPS, FIXED_FPS*2, FIXED_FPS*4, FIXED_FPS*8],
638
  value=FIXED_FPS,
639
  label="Video Fluidity (Frames per Second)",
640
  info="Extra frames will be generated using flow estimation, which estimates motion between frames to make the video smoother."
 
656
  )
657
  flow_shift_slider = gr.Slider(minimum=0.5, maximum=15.0, step=0.1, value=3.0, label="Flow Shift")
658
  play_result_video = gr.Checkbox(label="Display result", value=True, interactive=True)
659
+ gr.Markdown(f"[ZeroGPU help, tips and troubleshooting](https://huggingface.co/datasets/{ORG_NAME}/help/blob/main/gpu_help.md)")
660
+ gr.Markdown( # TestOrganizationPleaseIgnore/wamu-tools
661
+ "To use a different model, **duplicate this Space** first, then change the `REPO_ID` environment variable. "
662
+ "[See compatible models here](https://huggingface.co/models?other=diffusers:WanImageToVideoPipeline&sort=trending&search=WAN2.2_I2V_LIGHTNING)."
663
+ )
664
 
665
  generate_button = gr.Button("Generate Video", variant="primary")
666
 
667
  with gr.Column():
668
  # ASSIGNED elem_id="generated-video" so JS can find it
669
+ video_output = gr.Video(label="Generated Video", autoplay=True, sources=["upload"], buttons=["download", "share"], interactive=True, elem_id="generated-video")
670
 
671
  # --- Frame Grabbing UI ---
672
  with gr.Row():
 
709
  if __name__ == "__main__":
710
  demo.queue().launch(
711
  mcp_server=True,
712
+ css=CSS,
713
  show_error=True,
714
  )