LehongWu commited on
Commit
6cc3d86
·
verified ·
1 Parent(s): f8d9f81

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .dockerignore +14 -0
  3. .gitattributes +15 -0
  4. .gitignore +3 -0
  5. Dockerfile +33 -0
  6. README.md +30 -6
  7. __pycache__/gen_image_from_prompt.cpython-312.pyc +0 -0
  8. __pycache__/gen_image_prompt_only.cpython-312.pyc +0 -0
  9. __pycache__/gen_image_same_start_end.cpython-312.pyc +0 -0
  10. __pycache__/gen_prompt_only.cpython-312.pyc +0 -0
  11. __pycache__/gen_video_image_start_end.cpython-312.pyc +0 -0
  12. __pycache__/gen_video_prompt_only.cpython-312.pyc +0 -0
  13. __pycache__/generate_video.cpython-312.pyc +0 -0
  14. assets/example_1_prompt_to_image/output_a.png +3 -0
  15. assets/example_1_prompt_to_image/output_b.png +3 -0
  16. assets/example_2_image_to_image/input.png +3 -0
  17. assets/example_2_image_to_image/output.png +3 -0
  18. assets/example_3a_loop_video/first_last_frame.png +3 -0
  19. assets/example_3a_loop_video/output.mp4 +3 -0
  20. assets/example_3b_loop_video/first_last_frame.png +3 -0
  21. assets/example_3b_loop_video/output.mp4 +3 -0
  22. assets/example_4_super_res/input.png +0 -0
  23. assets/example_4_super_res/output_4k.png +3 -0
  24. assets/example_5_video_extension/output_a.mp4 +3 -0
  25. assets/example_5_video_extension/output_b.mp4 +3 -0
  26. docs/README.md +8 -0
  27. docs/SPEC_WEB_UI.md +113 -0
  28. docs/WEB_DEV_GUIDE.md +160 -0
  29. gen_image_image_cond.py +201 -0
  30. gen_image_prompt_only.py +165 -0
  31. gen_lyrics_batch.py +156 -0
  32. gen_video_image_start_end.py +194 -0
  33. gen_video_prompt_only.py +155 -0
  34. gen_video_prompt_only_extend.py +281 -0
  35. generate_lyrics.sh +46 -0
  36. generate_lyrics_batch.sh +48 -0
  37. image_super_resolution.sh +45 -0
  38. run_gen_image_image_cond.sh +42 -0
  39. run_gen_image_prompt_only.sh +36 -0
  40. run_gen_video_image_start_end.sh +30 -0
  41. run_gen_video_image_start_end_diff.sh +38 -0
  42. run_gen_video_prompt_only.sh +35 -0
  43. run_gen_video_prompt_only_extend.sh +49 -0
  44. run_gen_video_prompt_only_extend_2.sh +48 -0
  45. web/__init__.py +1 -0
  46. web/__pycache__/__init__.cpython-312.pyc +0 -0
  47. web/backend/__init__.py +1 -0
  48. web/backend/__pycache__/__init__.cpython-312.pyc +0 -0
  49. web/backend/__pycache__/config.cpython-312.pyc +0 -0
  50. web/backend/__pycache__/deps.cpython-312.pyc +0 -0
.DS_Store ADDED
Binary file (8.2 kB). View file
 
.dockerignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ **/__pycache__
4
+ **/*.py[cod]
5
+ .venv
6
+ venv
7
+ .env
8
+ .env.*
9
+
10
+ # Rebuilt inside the image; omit host bundle from context
11
+ web/backend/static
12
+
13
+ web/frontend/node_modules
14
+ **/node_modules
.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/example_1_prompt_to_image/output_a.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/example_1_prompt_to_image/output_b.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/example_2_image_to_image/input.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/example_2_image_to_image/output.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/example_3a_loop_video/first_last_frame.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/example_3a_loop_video/output.mp4 filter=lfs diff=lfs merge=lfs -text
42
+ assets/example_3b_loop_video/first_last_frame.png filter=lfs diff=lfs merge=lfs -text
43
+ assets/example_3b_loop_video/output.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ assets/example_4_super_res/output_4k.png filter=lfs diff=lfs merge=lfs -text
45
+ assets/example_5_video_extension/output_a.mp4 filter=lfs diff=lfs merge=lfs -text
46
+ assets/example_5_video_extension/output_b.mp4 filter=lfs diff=lfs merge=lfs -text
47
+ web/frontend/node_modules/@esbuild/darwin-arm64/bin/esbuild filter=lfs diff=lfs merge=lfs -text
48
+ web/frontend/node_modules/@rollup/rollup-darwin-arm64/rollup.darwin-arm64.node filter=lfs diff=lfs merge=lfs -text
49
+ web/frontend/node_modules/esbuild/bin/esbuild filter=lfs diff=lfs merge=lfs -text
50
+ web/frontend/node_modules/fsevents/fsevents.node filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ examples
2
+ output
3
+ .venv
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+ # Hugging Face Spaces: sdk: docker, default port 7860 (override with PORT).
3
+ # Build from repo root (directory that contains web/ and assets/).
4
+
5
+ FROM node:20-bookworm-slim AS frontend-build
6
+ WORKDIR /app/web/frontend
7
+ COPY web/frontend/package.json web/frontend/package-lock.json ./
8
+ RUN npm ci
9
+ COPY web/frontend/ ./
10
+ RUN npm run build
11
+
12
+ FROM python:3.12-slim-bookworm
13
+ WORKDIR /app
14
+
15
+ ENV PYTHONDONTWRITEBYTECODE=1 \
16
+ PYTHONUNBUFFERED=1 \
17
+ PYTHONPATH=/app
18
+
19
+ RUN apt-get update \
20
+ && apt-get install -y --no-install-recommends ffmpeg \
21
+ && rm -rf /var/lib/apt/lists/*
22
+
23
+ COPY web/requirements.txt /app/web/requirements.txt
24
+ RUN pip install --no-cache-dir -r /app/web/requirements.txt
25
+
26
+ COPY web /app/web
27
+ COPY assets /app/assets
28
+
29
+ COPY --from=frontend-build /app/web/backend/static /app/web/backend/static
30
+
31
+ EXPOSE 7860
32
+
33
+ CMD ["sh", "-c", "exec uvicorn web.backend.main:app --host 0.0.0.0 --port ${PORT:-7860}"]
README.md CHANGED
@@ -1,11 +1,35 @@
1
  ---
2
- title: VideoGeneration Release
3
- emoji: 🏃
4
- colorFrom: purple
5
- colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Gemini Studio Web
3
+ emoji: 🎨
4
+ colorFrom: gray
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
+ # Gemini Studio Web
11
+
12
+ Gemini 图片 / 视频创作台(FastAPI + React)。本仓库根目录的 **`README.md`** 是**总索引**(并供 Hugging Face Spaces 读取 YAML 元数据);具体说明拆到 `docs/` 下,避免与「给实现者的需求文档」「给开发者的运行手册」混在一起。
13
+
14
+ ## Documentation
15
+
16
+ | Document | Audience | Contents |
17
+ |----------|----------|----------|
18
+ | [**docs/SPEC_WEB_UI.md**](docs/SPEC_WEB_UI.md) | 产品 / 实现者 / AI | 功能范围、界面与示例页要求、非技术约束(原 `PLAN.md`) |
19
+ | [**docs/WEB_DEV_GUIDE.md**](docs/WEB_DEV_GUIDE.md) | 本机与部署的开发者 | 环境、`PYTHONPATH`、环境变量、`generation_options.json`、本地运行、稳定 URL、Docker、Hugging Face(原 `web/README.md`) |
20
+
21
+ 在 GitHub 里浏览 `docs/` 文件夹时,可先打开 **[docs/README.md](docs/README.md)**(仅索引,内容与上表一致)。
22
+
23
+ ---
24
+
25
+ ## Hugging Face Space
26
+
27
+ 部署到 Space 后,在 **Settings → Variables and secrets** 中配置(名称区分大小写)。保存后 Space 会重启;首次冷启动可能需一两分钟。
28
+
29
+ | Name | 说明 |
30
+ |------|------|
31
+ | `GEMINI_API_KEY` | Google AI Studio / Gemini API 密钥(仅服务端使用) |
32
+ | `WEB_UI_PASSWORD` | 登录本站时输入的密码 |
33
+ | `SESSION_SECRET` | 会话签名用随机串,例如本地执行 `openssl rand -hex 32` 生成 |
34
+
35
+ 更完整的步骤与 `docker run` 自测见 **[docs/WEB_DEV_GUIDE.md §10](docs/WEB_DEV_GUIDE.md#10-hugging-face-spaces-docker)**。可选变量 `GENERATION_OPTIONS_PATH` 等见该文档 **§3–4**。
__pycache__/gen_image_from_prompt.cpython-312.pyc ADDED
Binary file (6.86 kB). View file
 
__pycache__/gen_image_prompt_only.cpython-312.pyc ADDED
Binary file (5.56 kB). View file
 
__pycache__/gen_image_same_start_end.cpython-312.pyc ADDED
Binary file (7.17 kB). View file
 
__pycache__/gen_prompt_only.cpython-312.pyc ADDED
Binary file (4.9 kB). View file
 
__pycache__/gen_video_image_start_end.cpython-312.pyc ADDED
Binary file (7.26 kB). View file
 
__pycache__/gen_video_prompt_only.cpython-312.pyc ADDED
Binary file (6.01 kB). View file
 
__pycache__/generate_video.cpython-312.pyc ADDED
Binary file (5.42 kB). View file
 
assets/example_1_prompt_to_image/output_a.png ADDED

Git LFS Details

  • SHA256: 54e5fdc9947655adb42c0ac6d08ce4849086e3a717826cd20b651525e5c8dba1
  • Pointer size: 131 Bytes
  • Size of remote file: 561 kB
assets/example_1_prompt_to_image/output_b.png ADDED

Git LFS Details

  • SHA256: dcd66058f8b1834eb25351f15fac15397610b44a34840c8a4a79de3de5024cb0
  • Pointer size: 131 Bytes
  • Size of remote file: 562 kB
assets/example_2_image_to_image/input.png ADDED

Git LFS Details

  • SHA256: 411d4d81339d11cb9916c78926423203f2a50157c8bd779189fa5b5569e8689f
  • Pointer size: 131 Bytes
  • Size of remote file: 450 kB
assets/example_2_image_to_image/output.png ADDED

Git LFS Details

  • SHA256: 8e93a25ea80882d1a00388a7b907bf55e9a5d442dbfdcc9d694cf84e0d7d31bf
  • Pointer size: 133 Bytes
  • Size of remote file: 40.3 MB
assets/example_3a_loop_video/first_last_frame.png ADDED

Git LFS Details

  • SHA256: cc0dc8afdcdb81ac92ed72cb48531cd5a1f56652ed9eb3c126e6d135e4584a83
  • Pointer size: 131 Bytes
  • Size of remote file: 584 kB
assets/example_3a_loop_video/output.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c324a665d3dc84fd4c1df27d0c82283a51b7c96e300a9e34059eb114bc24a753
3
+ size 30184274
assets/example_3b_loop_video/first_last_frame.png ADDED

Git LFS Details

  • SHA256: f1bd09d1fbc0f118d934c724283bf58185c01f7f694bb844f318df5fdf4f33c9
  • Pointer size: 131 Bytes
  • Size of remote file: 383 kB
assets/example_3b_loop_video/output.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08cfdafd9665b2a8f3d6d9b953b9cb0c79961a73e1fa9818c6f0af7e827f8e51
3
+ size 9509577
assets/example_4_super_res/input.png ADDED
assets/example_4_super_res/output_4k.png ADDED

Git LFS Details

  • SHA256: ff3c8e6a39a1de32d0c0909c2c1776deec98ba1ec2406f0246d3f8c441a894af
  • Pointer size: 132 Bytes
  • Size of remote file: 3.93 MB
assets/example_5_video_extension/output_a.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19cdca400d87057f9097ff2637fcd73b4bdbb4f4f9737cd473c6c5bf6990bca4
3
+ size 9074282
assets/example_5_video_extension/output_b.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17091cc192dbc35f4db48f2452d0c6a52bdca3d83ca225c947ad04ad38516232
3
+ size 9567422
docs/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Documentation
2
+
3
+ | File | Role |
4
+ |------|------|
5
+ | [SPEC_WEB_UI.md](./SPEC_WEB_UI.md) | Product / implementation specification |
6
+ | [WEB_DEV_GUIDE.md](./WEB_DEV_GUIDE.md) | Developer runbook (local run, env, Docker, Hugging Face) |
7
+
8
+ Hub (overview + HF secrets summary): **[README.md](../README.md)**.
docs/SPEC_WEB_UI.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web UI — product specification
2
+
3
+ This file is the **authoring / product spec** for turning the repo into a web UI: what to build, feature scope, and UX expectations. It is aimed at **implementers and AI assistants**. For how to run the stack, env vars, and deployment, see **[`WEB_DEV_GUIDE.md`](./WEB_DEV_GUIDE.md)**.
4
+
5
+ ---
6
+
7
+ This is the initial plan for how to change this pure code-based repo to a web ui.
8
+
9
+ # Overview
10
+ The final version of this repo should be launched as a web ui, which supports image and video generation.
11
+ The user might upload prompts and images (optional) as condition.
12
+ There will be several main features:
13
+
14
+ ## 第一板块:AI创作台
15
+ - A. 图片生成或编辑:提供0-3张参考图片和提示词,生成一张图
16
+ - 思考强度:界面**并列三项**(模型 + 强度合一)——Flash(快速,默认 minimal)、Flash(快速)(长思考,high)、Pro(标准)(长思考,high);对应 `gemini-3.1-flash-image-preview` / `gemini-3-pro-image-preview` + `thinking_level`
17
+ - 宽高比:"1:1","2:3","3:2","3:4","4:3","4:5","5:4","9:16","16:9","21:9"
18
+ - 分辨率:"1K", "2K", "4K"
19
+ - B. 视频生成:提供0-3张参考图片和提示词,生成一个短视频
20
+ - 模型(可配置,见 `generation_options.json`):`veo-3.1-generate-preview`、`veo-3.1-lite-generate-preview`、`veo-3.1-fast-generate-preview`;界面标注为 **(标准)/(轻量)/(快速)**;其中 **Lite 不支持参考图**(由 `supports_reference_images` 标注)
21
+ - 宽高比:16:9 或 9:16
22
+ - 分辨率:720p、1080p 或 4k
23
+ - 时长:纯文案时可 4/6/8 秒(与分辨率组合以 API 为准);**有参考图时固定 8 秒**(Veo 接口要求)
24
+ - C. 视频生成(首尾过渡)(起始/可选结尾帧):至少 1 张起始帧 + 提示词,生成一个短视频;结尾帧可选,或勾选「结尾与起始相同」
25
+ - 模型:与 B 相同,同上三项 Veo 预览模型,可配置
26
+ - 宽高比:16:9 或 9:16
27
+ - 分辨率:720p、1080p 或 4k
28
+ - 时长:固定 **8 秒**(首/尾帧条件时 Veo 不接受 4/6 秒,与纯文案视频不同)
29
+
30
+
31
+ ## 第二板块:辅助工具
32
+
33
+ - 超分辨率:
34
+ - 内在调用与「图片编辑或生成」相同,使用 1 张参考图生成更高清图片
35
+ - 提示词**默认**为「保持内容完全不变,提高图片的分辨率」,**可修改**
36
+ - **先上传原图(必填)**;根据原图**宽高比**在配置列表中**自动择近匹配**;**宽高比选项放在表单最下**,可手动改
37
+ - **分辨率**默认 4K,**不**根据原图自动推断,用户自选 1K/2K/4K
38
+ - 若原图宽高比与列表中**任一项都不接近**,界面**警告**:生成图可能与原图不完全一致,仍可点击生成
39
+ - 模型**默认「快速」**,可改为「标准」
40
+
41
+ - 提取视频的特定帧:
42
+ - 用户上传一个视频
43
+ - 视频将出现一个胶片一样可以拉动的进度条,用户随便停在一个位置,展示具体的时间和对应图像的preview
44
+ - 一旦点击“下载”,在下方讲出现这一帧作为一张单独的图片,可供用户下载。
45
+
46
+ - 图像裁剪(**前端 Canvas**,`/tools/crop`,不上传服务器)
47
+ - 上传图片,**交互式裁剪框**(框内平移、四角缩放;「自由」下四边为**可见白条**可拖,命中区与光标反馈按画布像素计算)
48
+ - 可选固定比例:与 `generation_options.json` 中**图片宽高比**列表一致(另加「自由」);切换比例时重置为居中最大适配框
49
+ - 裁剪交互区即原图;裁剪结果随框**实时**更新,下载 PNG
50
+
51
+ - 替换纯色背景(**前端 Canvas**,`/tools/replace-bg`,不上传服务器)
52
+ - 用户上传一个图片(提示:仅适合**纯色或大块相近色**背景)
53
+ - **原始色 / 目标色**的设定方式一致:**系统调色板**(原生取色器,通常含放大镜/吸管)、**手动** R/G/B 或 Hex(可「应用 Hex」);可选在预览图上点击取像素色
54
+ - **不透明度** 0–100%(默认 100%,仅作用于替换结果)
55
+ - 与原始色在 RGB 距离 ≤ **容差** 的像素改为目标 RGBA(容差可调)
56
+ - 下方预览,可下载 PNG
57
+
58
+ ## 第三板块:示例
59
+ 这一类是上述创作台的**简化和示例**版本,例如,已经为你写好提示词,选定各种参数,输入参考图片(这里来自asset/目录,有待添加),展示输出。
60
+ 理论上,你用上面的创作台能达到完全一样的效果,只是这里给了你例子。因此这个页面是完全静止的,不涉及model query。
61
+
62
+ 简介:Wake-UP人声乐团是北京大学2025-2026年度十佳歌手冠军,他们比赛现场的彩幕制作过程使用了如下功能。
63
+
64
+ 1. 手写字体生成
65
+ - 展示两张output
66
+
67
+ 2. 在图片上加文字
68
+ - 展示提示词、一张input和output
69
+
70
+ 3. 循环视频制作
71
+ - 展示提示词、一张input和output video
72
+
73
+ 4. 图像超分辨率
74
+ - 展示一张input和output
75
+
76
+ 5. 视频延伸(长视频生成)
77
+ 只展示output,说此功能敬请期待(保持神秘)
78
+
79
+
80
+ # 示例代码
81
+ 所有上述功能都有一个或多个初步的纯代码脚本
82
+ - A. 参考 /Users/lehongwu/Projects/others/lyrics/VideoGeneration-release/run_gen_image_image_cond.sh 和 /Users/lehongwu/Projects/others/lyrics/VideoGeneration-release/run_gen_image_prompt_only.sh
83
+
84
+ - B. 参考 /Users/lehongwu/Projects/others/lyrics/VideoGeneration-release/run_gen_video_prompt_only.sh
85
+
86
+ - C. 参考 /Users/lehongwu/Projects/others/lyrics/VideoGeneration-release/run_gen_video_image_start_end_diff.sh
87
+
88
+ 注意,上述代码不一定包含完整功能,例如,只输入一张图片,而不是0-3张。对于完整document和例子,都参考如下网站:
89
+ https://ai.google.dev/gemini-api/docs/image-generation?hl=zh-cn
90
+ https://ai.google.dev/gemini-api/docs/video?hl=zh-cn
91
+
92
+ # 界面要求
93
+ - 初始有一个输入密码界面,这个密码是launch网站之前由用户设置的环境变量
94
+ - 密码正确后,侧栏有上述多重feature选项,点开其中任意一个,包含:
95
+ 1. 功能简介和指示
96
+ 2. 输入提示词的窗口(不可以为空)
97
+ 3. 留给用户上传图片的空位(feature A/B 有三个参考图空位,均可空;feature C 有起始帧必填、结尾帧可选,并可勾选与起始相同)
98
+ 4. 选择模型、宽高比、分辨率(视频类还有时长,受 API/参考图约束)
99
+ 5. 输出图片/视频的空位
100
+ - 在生成过程中,可以有某种计时器显示运行时间,不要让用户感觉卡住了
101
+
102
+ # 其他要求
103
+ - Gemini api key是launch网站之前由用户设置的环境变量,千万不能泄露或者hard-code在代码里
104
+ - 所有代码是英文,但是网站上的文字(例如介绍)和提示词可以用中文
105
+ - 网站页面风格:请用比较美观的模板和风格,不用太花哨,但是要有艺术气息
106
+
107
+
108
+ # 进一步要求/修改建议
109
+ - 无论我在哪个机器launch这个server,我希望url不要变(debug阶段可以用localhost或者x.x.x.x,但最终开放的版本肯定不行),可以是我自己设计的一个url,但是我希望如果我更换serve的机器这个url不会改变,这样用户能一直用相同url访问。我不确定这个能不能做到,请你给出方案。
110
+
111
+ - 对于上述模型名称、分辨率、宽高比的选项,我希望不是hard-code的list,而是能够有一个独立让我修改的地方,因为这些api支持的选项可能随时变化。
112
+
113
+ **实现说明(Web UI)**:选项列表集中在 `web/config/generation_options.json`(可用环境变量 `GENERATION_OPTIONS_PATH` 指向其他文件)。图片模型、视频/视频生成(首尾过渡)的 Veo 模型名、宽高比、分辨率、时长等均从此处加载;修改后一般无需重编前端,但若改 React/样式需 `cd web/frontend && npm run build` 更新 `web/backend/static/`。
docs/WEB_DEV_GUIDE.md ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web UI — developer guide
2
+
3
+ How to **set up, run, configure, and deploy** the Gemini Studio Web stack (FastAPI + React). End users only need the public URL and password.
4
+
5
+ For **what to build** (features, UX intent, examples page scope), see **[`SPEC_WEB_UI.md`](./SPEC_WEB_UI.md)**.
6
+
7
+ **Layout:** Application code lives under **`VideoGeneration-release/web/`**. All `from web.backend...` imports assume **`PYTHONPATH`** includes **`VideoGeneration-release`** (the parent of the `web/` directory). Do not set `PYTHONPATH` to `web/` itself—that will break imports.
8
+
9
+ ## 1. Prerequisites
10
+
11
+ - **Python** 3.10+ recommended.
12
+ - **Node.js** 18+ and **npm** (for building the frontend).
13
+ - **ffmpeg** on your `PATH` (same as the CLI video scripts; used to strip audio from MP4s).
14
+
15
+ ## 2. Python virtual environment
16
+
17
+ From **`VideoGeneration-release`** (the directory that contains `web/`):
18
+
19
+ ```bash
20
+ cd /path/to/VideoGeneration-release
21
+ python3 -m venv .venv
22
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
23
+ pip install -r web/requirements.txt
24
+ ```
25
+
26
+ ## 3. Environment variables
27
+
28
+ Set these before starting Uvicorn (or put them in a `.env` file and load with your process manager—**do not** commit real secrets):
29
+
30
+ | Variable | Purpose |
31
+ |----------|---------|
32
+ | `GEMINI_API_KEY` | Gemini API key (server only; never exposed to the browser). |
33
+ | `WEB_UI_PASSWORD` | Login password for the web UI. |
34
+ | `SESSION_SECRET` | Random string for signing session cookies, e.g. `openssl rand -hex 32`. |
35
+ | `GENERATION_OPTIONS_PATH` | Optional. Absolute path to a JSON file that overrides the default option lists. If unset, the server uses `web/config/generation_options.json`. |
36
+
37
+ Example for local debugging:
38
+
39
+ ```bash
40
+ export GEMINI_API_KEY="your_key"
41
+ export WEB_UI_PASSWORD="password"
42
+ export SESSION_SECRET="$(openssl rand -hex 32)"
43
+ ```
44
+
45
+ ## 4. Configurable model / resolution / aspect lists
46
+
47
+ Edit **`web/config/generation_options.json`** (or the file pointed to by `GENERATION_OPTIONS_PATH`). The UI loads these values from **`GET /api/config/generation-options`** after login—no frontend rebuild is required when you change only this JSON. Rebuild the frontend only when you change React/TS/CSS.
48
+
49
+ Schema (informal):
50
+
51
+ - **`image`**: `models` (`value` + `label`; e.g. **(快速)** for Flash vs **(标准)** for Pro), `aspect_ratios`, `resolutions`, `thinking_levels` (`value` + `label`).
52
+ - **`video`** / **`video_frames`**: `models` (`value` + `label`, Veo IDs), `aspect_ratios`, `resolutions`, `durations_seconds`. On **`video`**, each model may set **`supports_reference_images`** (boolean); e.g. Veo 3.1 Lite is **`false`**. With reference images, the backend also forces **8s** duration per API rules. The **首尾帧** API route always uses **8s** (frame-conditioned video does not accept 4/6s like prompt-only 720p).
53
+
54
+ ## 5. Build the frontend once
55
+
56
+ ```bash
57
+ cd web/frontend
58
+ npm install
59
+ npm run build
60
+ ```
61
+
62
+ Output goes to **`web/backend/static/`**. If this directory is missing, the API still runs, but visiting the root URL returns 503 until you build.
63
+
64
+ ## 6. Launch the server
65
+
66
+ From **`VideoGeneration-release`** (parent of `web/`):
67
+
68
+ ```bash
69
+ cd /path/to/VideoGeneration-release
70
+ PYTHONPATH=. uvicorn web.backend.main:app --host 127.0.0.1 --port 8000
71
+ ```
72
+
73
+ For LAN testing from other devices on the same network:
74
+
75
+ ```bash
76
+ cd /path/to/VideoGeneration-release
77
+ PYTHONPATH=. uvicorn web.backend.main:app --host 0.0.0.0 --port 8000
78
+ ```
79
+
80
+ Use `--reload` during development.
81
+
82
+ ## 7. Development mode (hot reload)
83
+
84
+ **Terminal A** — API (cwd = **`VideoGeneration-release`**):
85
+
86
+ ```bash
87
+ cd /path/to/VideoGeneration-release
88
+ PYTHONPATH=. uvicorn web.backend.main:app --reload --host 127.0.0.1 --port 8000
89
+ ```
90
+
91
+ **Terminal B** — Vite (proxies `/api` to port 8000):
92
+
93
+ ```bash
94
+ cd /path/to/VideoGeneration-release/web/frontend && npm run dev
95
+ ```
96
+
97
+ Open the URL Vite prints (e.g. `http://127.0.0.1:5173`). The API key stays on the server; the browser only talks to Vite, which forwards `/api` to Uvicorn.
98
+
99
+ ## 8. Stable URL when you change machines
100
+
101
+ The application does **not** assign a public hostname by itself. A stable URL for users is an **infrastructure** concern:
102
+
103
+ 1. **Own domain + DNS**
104
+ Register a domain (e.g. `studio.example.com`). Create an **A** (or **AAAA**) record pointing to the **current** server’s public IP. When you move to a new machine, update the DNS record to the new IP. Users keep the same hostname.
105
+
106
+ 2. **Static IP or elastic IP**
107
+ If your cloud provider offers a static/elastic IP, attach it to whichever instance runs the app; point your DNS name to that IP.
108
+
109
+ 3. **Reverse proxy**
110
+ Run **nginx** or **Caddy** on the server (or a small VPS in front): TLS termination, `proxy_pass` to `127.0.0.1:8000`. Users hit `https://studio.example.com` only.
111
+
112
+ 4. **Tunnel / no public IP**
113
+ **Cloudflare Tunnel**, **Tailscale Funnel**, or similar gives you a stable hostname without opening ports on your home router; the tunnel endpoint can be repointed when the backend machine changes (depending on the product).
114
+
115
+ 5. **What not to expect**
116
+ Hard-coding `localhost` or a raw IP in the app will not give a stable branded URL. The fix is always: **one DNS name you control** → **current server location**.
117
+
118
+ ## 9. Quick checklist
119
+
120
+ - [ ] `ffmpeg` works: `ffmpeg -version`
121
+ - [ ] `pip install -r web/requirements.txt` in a venv
122
+ - [ ] `npm run build` in `VideoGeneration-release/web/frontend` at least once
123
+ - [ ] Three env vars set: `GEMINI_API_KEY`, `WEB_UI_PASSWORD`, `SESSION_SECRET`
124
+ - [ ] Start Uvicorn with `PYTHONPATH=.` from **`VideoGeneration-release`** (folder that contains `web/`)
125
+
126
+ ## 10. Hugging Face Spaces (Docker)
127
+
128
+ This UI is **not** Streamlit/Gradio; deploy with **`sdk: docker`** and the **`Dockerfile`** at the repo root (same directory as `web/` and `assets/`).
129
+
130
+ 1. Create a **Docker** Space and point it at this repository (or push this folder to a GitHub repo and connect the Space).
131
+ 2. In the Space **Settings → Variables and secrets**, add **Repository secrets** (or Variables) with exactly these names:
132
+
133
+ | Name | Purpose |
134
+ |------|---------|
135
+ | `GEMINI_API_KEY` | Same as local; never commit it. |
136
+ | `WEB_UI_PASSWORD` | Password users type on the login page. |
137
+ | `SESSION_SECRET` | Same as local, e.g. `openssl rand -hex 32`. |
138
+
139
+ Hugging Face injects them as environment variables; the app reads them the same way as on your laptop.
140
+
141
+ 3. The container listens on **`PORT`** if set (Spaces often set it); otherwise **`7860`**. Do not hard-code a port in the app; the provided `Dockerfile` uses `uvicorn ... --port ${PORT:-7860}`.
142
+
143
+ 4. **ffmpeg** is installed in the image (required for stripping audio from generated MP4s).
144
+
145
+ 5. Optional: set **`GENERATION_OPTIONS_PATH`** in the same secrets UI if you mount a custom JSON elsewhere; otherwise the bundled `web/config/generation_options.json` is used.
146
+
147
+ 6. Build can take several minutes on HF; first request after idle may hit cold start.
148
+
149
+ Local test of the image (from **`VideoGeneration-release`**):
150
+
151
+ ```bash
152
+ docker build -t gemini-studio-web .
153
+ docker run --rm -p 7860:7860 \
154
+ -e GEMINI_API_KEY="your_key" \
155
+ -e WEB_UI_PASSWORD="your_password" \
156
+ -e SESSION_SECRET="$(openssl rand -hex 32)" \
157
+ gemini-studio-web
158
+ ```
159
+
160
+ Then open `http://127.0.0.1:7860`.
gen_image_image_cond.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import json
4
+ import os
5
+ import sys
6
+ import threading
7
+ import time
8
+ from pathlib import Path
9
+
10
+ from google import genai
11
+ from google.genai import types
12
+ from PIL import Image
13
+
14
+
15
+ def parse_args() -> argparse.Namespace:
16
+ parser = argparse.ArgumentParser(
17
+ description="Generate an image conditioned on one or more input images using Gemini (Nano Banana)."
18
+ )
19
+ parser.add_argument("--prompt", required=True, help="Prompt describing the desired output image.")
20
+ parser.add_argument(
21
+ "--input-image-path",
22
+ "--input_image_path",
23
+ dest="input_image_path",
24
+ required=True,
25
+ help="Path to the primary conditioning image.",
26
+ )
27
+ parser.add_argument(
28
+ "--extra-image-paths",
29
+ "--extra_image_paths",
30
+ dest="extra_image_paths",
31
+ nargs="*",
32
+ default=[],
33
+ help="Optional additional conditioning image paths (up to 13 total images).",
34
+ )
35
+ parser.add_argument(
36
+ "--model",
37
+ default="gemini-3.1-flash-image-preview",
38
+ help="Image generation model name (e.g. gemini-3.1-flash-image-preview, gemini-3-pro-image-preview, gemini-2.5-flash-image).",
39
+ )
40
+ parser.add_argument("--name", default="img_cond", help="Base output filename (without extension).")
41
+ parser.add_argument(
42
+ "--output-dir",
43
+ "--output_dir",
44
+ dest="output_dir",
45
+ default="output_dir",
46
+ help="Directory to save outputs (default: output_dir).",
47
+ )
48
+ parser.add_argument(
49
+ "--aspect-ratio",
50
+ default="1:1",
51
+ help="Aspect ratio (e.g. 1:1, 16:9, 9:16, 4:3, 3:4, 21:9).",
52
+ )
53
+ parser.add_argument(
54
+ "--resolution",
55
+ default="1K",
56
+ help="Output resolution: 512px, 1K, 2K, or 4K (Gemini 3 models only).",
57
+ )
58
+ parser.add_argument(
59
+ "--number-of-images",
60
+ type=int,
61
+ default=1,
62
+ help="How many images to generate (runs the request N times).",
63
+ )
64
+ parser.add_argument(
65
+ "--thinking-level",
66
+ default=None,
67
+ choices=["minimal", "high"],
68
+ help="Thinking level for Gemini 3.1 Flash Image: 'minimal' or 'high'.",
69
+ )
70
+ return parser.parse_args()
71
+
72
+
73
+ def load_pil_image(image_path: Path) -> Image.Image:
74
+ if not image_path.exists():
75
+ raise FileNotFoundError(f"Input image not found: {image_path}")
76
+ return Image.open(str(image_path))
77
+
78
+
79
+ def build_image_config(args: argparse.Namespace) -> types.ImageConfig:
80
+ kwargs: dict = {"aspect_ratio": args.aspect_ratio}
81
+ gemini3_models = {"gemini-3.1-flash-image-preview", "gemini-3-pro-image-preview"}
82
+ if args.model in gemini3_models:
83
+ kwargs["image_size"] = args.resolution
84
+ return types.ImageConfig(**kwargs)
85
+
86
+
87
+ def generate_one(
88
+ client: genai.Client,
89
+ args: argparse.Namespace,
90
+ image_config: types.ImageConfig,
91
+ pil_images: list[Image.Image],
92
+ ) -> bytes | None:
93
+ config_kwargs: dict = {
94
+ "response_modalities": ["IMAGE"],
95
+ "image_config": image_config,
96
+ }
97
+ if args.thinking_level and args.model == "gemini-3.1-flash-image-preview":
98
+ config_kwargs["thinking_config"] = types.ThinkingConfig(
99
+ thinking_level=args.thinking_level.capitalize(),
100
+ )
101
+
102
+ contents: list = [args.prompt] + pil_images
103
+
104
+ response = client.models.generate_content(
105
+ model=args.model,
106
+ contents=contents,
107
+ config=types.GenerateContentConfig(**config_kwargs),
108
+ )
109
+
110
+ for part in response.parts:
111
+ if part.thought:
112
+ continue
113
+ if part.inline_data is not None:
114
+ return part.inline_data.data
115
+
116
+ return None
117
+
118
+
119
+ def main() -> int:
120
+ args = parse_args()
121
+
122
+ if not os.getenv("GEMINI_API_KEY"):
123
+ print("Missing GEMINI_API_KEY environment variable.", file=sys.stderr)
124
+ return 1
125
+
126
+ primary_path = Path(args.input_image_path).expanduser().resolve()
127
+ all_image_paths = [primary_path] + [
128
+ Path(p).expanduser().resolve() for p in args.extra_image_paths
129
+ ]
130
+
131
+ pil_images: list[Image.Image] = []
132
+ for p in all_image_paths:
133
+ print(f"Loading input image: {p}")
134
+ pil_images.append(load_pil_image(p))
135
+
136
+ client = genai.Client()
137
+ image_config = build_image_config(args)
138
+
139
+ out_dir = Path(args.output_dir)
140
+ out_dir.mkdir(parents=True, exist_ok=True)
141
+
142
+ saved_files: list[str] = []
143
+
144
+ for idx in range(1, args.number_of_images + 1):
145
+ label = f" ({idx}/{args.number_of_images})" if args.number_of_images > 1 else ""
146
+ print(f"Generating image{label}...")
147
+
148
+ result: dict = {}
149
+ thread = threading.Thread(
150
+ target=lambda: result.update({"bytes": generate_one(client, args, image_config, pil_images)}),
151
+ daemon=True,
152
+ )
153
+ started_at = time.time()
154
+ thread.start()
155
+ while thread.is_alive():
156
+ thread.join(timeout=10)
157
+ if thread.is_alive():
158
+ elapsed = int(time.time() - started_at)
159
+ print(f"Waiting for image generation... elapsed: {elapsed}s")
160
+ elapsed = int(time.time() - started_at)
161
+ print(f"Image generation finished in {elapsed}s")
162
+
163
+ image_bytes = result.get("bytes")
164
+ if image_bytes is None:
165
+ print(f"No image returned for generation {idx}.", file=sys.stderr)
166
+ continue
167
+
168
+ if args.number_of_images == 1:
169
+ out_path = out_dir / f"{args.name}.png"
170
+ else:
171
+ out_path = out_dir / f"{args.name}_{idx}.png"
172
+
173
+ out_path.write_bytes(image_bytes)
174
+ saved_files.append(str(out_path.resolve()))
175
+ print(f"Saved image: {out_path.resolve()}")
176
+
177
+ if not saved_files:
178
+ print("No images were saved.", file=sys.stderr)
179
+ return 2
180
+
181
+ metadata: dict = {
182
+ "prompt": args.prompt,
183
+ "model": args.model,
184
+ "input_images": [str(p) for p in all_image_paths],
185
+ "config": {
186
+ "aspect_ratio": args.aspect_ratio,
187
+ "resolution": args.resolution,
188
+ "number_of_images": args.number_of_images,
189
+ "thinking_level": args.thinking_level,
190
+ },
191
+ "saved_images": saved_files,
192
+ }
193
+ metadata_path = out_dir / f"{args.name}.json"
194
+ metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
195
+ print(f"Saved metadata: {metadata_path.resolve()}")
196
+
197
+ return 0
198
+
199
+
200
+ if __name__ == "__main__":
201
+ raise SystemExit(main())
gen_image_prompt_only.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import json
4
+ import os
5
+ import sys
6
+ import threading
7
+ import time
8
+ from pathlib import Path
9
+
10
+ from google import genai
11
+ from google.genai import types
12
+
13
+
14
def parse_args() -> argparse.Namespace:
    """Parse CLI options for prompt-only image generation."""
    parser = argparse.ArgumentParser(
        description="Generate an image from a text prompt using Gemini (Nano Banana)."
    )
    add = parser.add_argument
    add("--prompt", required=True, help="Prompt describing the image.")
    add(
        "--model",
        default="gemini-3.1-flash-image-preview",
        help="Image generation model name (e.g. gemini-3.1-flash-image-preview, gemini-3-pro-image-preview, gemini-2.5-flash-image).",
    )
    add("--name", default="generated_image", help="Base output filename (without extension).")
    # Both the dashed and underscored spellings are accepted for this flag.
    add(
        "--output-dir",
        "--output_dir",
        dest="output_dir",
        default="output_dir",
        help="Directory to save outputs (default: output_dir).",
    )
    add(
        "--aspect-ratio",
        default="1:1",
        help="Aspect ratio (e.g. 1:1, 16:9, 9:16, 4:3, 3:4, 21:9).",
    )
    add(
        "--resolution",
        default="1K",
        help="Output resolution: 512px, 1K, 2K, or 4K (Gemini 3 models only).",
    )
    add(
        "--number-of-images",
        type=int,
        default=1,
        help="How many images to generate (runs the request N times).",
    )
    add(
        "--thinking-level",
        default=None,
        choices=["minimal", "high"],
        help="Thinking level for Gemini 3.1 Flash Image: 'minimal' or 'high'.",
    )
    return parser.parse_args()
55
+
56
+
57
def build_image_config(args: argparse.Namespace) -> types.ImageConfig:
    """Translate CLI arguments into a ``types.ImageConfig``.

    ``image_size`` is only attached for Gemini-3 image models; other models
    get an aspect-ratio-only config.
    """
    gemini3_models = {"gemini-3.1-flash-image-preview", "gemini-3-pro-image-preview"}
    config_args: dict = {"aspect_ratio": args.aspect_ratio}
    if args.model in gemini3_models:
        config_args["image_size"] = args.resolution
    return types.ImageConfig(**config_args)
63
+
64
+
65
def generate_one(
    client: genai.Client,
    args: argparse.Namespace,
    image_config: types.ImageConfig,
) -> bytes | None:
    """Run one image-generation request and return the raw image bytes.

    Returns None when the response carries no image part (e.g. the request
    was blocked, or the model emitted only thoughts).
    """
    config_kwargs: dict = {
        "response_modalities": ["IMAGE"],
        "image_config": image_config,
    }
    if args.thinking_level and args.model == "gemini-3.1-flash-image-preview":
        # NOTE(review): SDK examples typically use lowercase thinking levels;
        # confirm the capitalized form ("Minimal"/"High") is what the API expects.
        config_kwargs["thinking_config"] = types.ThinkingConfig(
            thinking_level=args.thinking_level.capitalize(),
        )

    response = client.models.generate_content(
        model=args.model,
        contents=[args.prompt],
        config=types.GenerateContentConfig(**config_kwargs),
    )

    # response.parts can be None on blocked/empty responses; guard so we
    # return None instead of raising TypeError while iterating.
    for part in response.parts or []:
        if part.thought:
            # Skip "thinking" parts produced by thinking-enabled models.
            continue
        if part.inline_data is not None:
            return part.inline_data.data

    return None
92
+
93
+
94
def main() -> int:
    """CLI entry point: generate N images and write them plus a JSON metadata file.

    Returns 0 on success, 1 on configuration errors, 2 when nothing was saved.
    """
    args = parse_args()

    if not os.getenv("GEMINI_API_KEY"):
        print("Missing GEMINI_API_KEY environment variable.", file=sys.stderr)
        return 1

    client = genai.Client()
    image_config = build_image_config(args)

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    saved_files: list[str] = []

    for idx in range(1, args.number_of_images + 1):
        label = f" ({idx}/{args.number_of_images})" if args.number_of_images > 1 else ""
        print(f"Generating image{label}...")

        # Run the request in a worker thread so the main thread can print
        # periodic progress. Capture exceptions explicitly: a bare lambda
        # swallows them and makes failures look like "no image returned".
        result: dict = {}

        def _worker() -> None:
            try:
                result["bytes"] = generate_one(client, args, image_config)
            except Exception as exc:  # reported to the user below
                result["error"] = exc

        thread = threading.Thread(target=_worker, daemon=True)
        started_at = time.time()
        thread.start()
        while thread.is_alive():
            thread.join(timeout=10)
            if thread.is_alive():
                elapsed = int(time.time() - started_at)
                print(f"Waiting for image generation... elapsed: {elapsed}s")
        elapsed = int(time.time() - started_at)
        print(f"Image generation finished in {elapsed}s")

        if "error" in result:
            print(f"Generation {idx} failed: {result['error']}", file=sys.stderr)
            continue

        image_bytes = result.get("bytes")
        if image_bytes is None:
            print(f"No image returned for generation {idx}.", file=sys.stderr)
            continue

        # Single image keeps the bare base name; multiple get a _N suffix.
        if args.number_of_images == 1:
            out_path = out_dir / f"{args.name}.png"
        else:
            out_path = out_dir / f"{args.name}_{idx}.png"

        out_path.write_bytes(image_bytes)
        saved_files.append(str(out_path.resolve()))
        print(f"Saved image: {out_path.resolve()}")

    if not saved_files:
        print("No images were saved.", file=sys.stderr)
        return 2

    metadata: dict = {
        "prompt": args.prompt,
        "model": args.model,
        "config": {
            "aspect_ratio": args.aspect_ratio,
            "resolution": args.resolution,
            "number_of_images": args.number_of_images,
            "thinking_level": args.thinking_level,
        },
        "saved_images": saved_files,
    }
    metadata_path = out_dir / f"{args.name}.json"
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    print(f"Saved metadata: {metadata_path.resolve()}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
gen_lyrics_batch.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate one image per row of lyrics from a text file.
4
+ Each line is used as the Chinese characters in the image generation prompt.
5
+ """
6
+ import argparse
7
+ import subprocess
8
+ import sys
9
+ from pathlib import Path
10
+
11
+
12
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the batch lyrics-to-image generator."""
    parser = argparse.ArgumentParser(
        description="Generate images for each line of lyrics from a text file."
    )
    # Dashed and underscored spellings are both accepted for multi-word flags.
    parser.add_argument(
        "--lyrics-file",
        "--lyrics_file",
        dest="lyrics_file",
        required=True,
        help="Path to the lyrics text file (one line per image).",
    )
    parser.add_argument(
        "--input-image-path",
        "--input_image_path",
        dest="input_image_path",
        required=True,
        help="Path to the primary conditioning image.",
    )
    parser.add_argument(
        "--output-dir",
        "--output_dir",
        dest="output_dir",
        default="output_dir",
        help="Directory to save outputs (default: output_dir).",
    )
    parser.add_argument(
        "--model",
        default="gemini-3.1-flash-image-preview",
        help="Image generation model name.",
    )
    parser.add_argument(
        "--aspect-ratio",
        default="16:9",
        help="Aspect ratio (e.g. 1:1, 16:9, 9:16).",
    )
    parser.add_argument(
        "--resolution",
        default="2K",
        help="Output resolution: 512px, 1K, 2K, or 4K.",
    )
    parser.add_argument(
        "--extra-image-paths",
        dest="extra_image_paths",
        nargs="*",
        default=[],
        help="Optional additional conditioning image paths.",
    )
    parser.add_argument(
        "--thinking-level",
        default=None,
        choices=["minimal", "high"],
        help="Thinking level for Gemini 3.1 Flash Image.",
    )
    # None means "all rows"; an explicit empty list would select nothing.
    parser.add_argument(
        "--row-ids",
        "--row_ids",
        dest="row_ids",
        type=int,
        nargs="*",
        default=None,
        help="Specific row IDs to generate (1-based). If not set, generate all.",
    )
    return parser.parse_args()
75
+
76
+
77
def build_prompt(chars: str) -> str:
    """Return the image-generation prompt for the given Chinese characters."""
    template = f"""
Replace the chinese characters with '{chars}'.
Black text on pure white background. The thickness of the strokes should be consistent with the original image. One character.
Strictly follow the font of the original image.
"""
    return template.strip()
84
+
85
+
86
def main() -> int:
    """Generate one image per non-empty lyrics line by shelling out to
    gen_image_image_cond.py once per row.

    Returns 0 on success, 1 on input errors, or the child's exit code on the
    first failed generation (processing stops at the first failure).
    """
    args = parse_args()

    lyrics_path = Path(args.lyrics_file).expanduser().resolve()
    if not lyrics_path.exists():
        print(f"Error: Lyrics file not found: {lyrics_path}", file=sys.stderr)
        return 1

    lines = lyrics_path.read_text(encoding="utf-8").strip().splitlines()
    # row_id = 1-based line number in file (correlates to txt row, enables selective generation later)
    rows_to_generate = [(i, line.strip()) for i, line in enumerate(lines, start=1) if line.strip()]

    if args.row_ids is not None:
        # Keep only the requested rows; IDs refer to original file line numbers.
        row_ids_set = set(args.row_ids)
        rows_to_generate = [(row_id, chars) for row_id, chars in rows_to_generate if row_id in row_ids_set]
        if not rows_to_generate:
            print("Error: No matching rows found for the given row IDs.", file=sys.stderr)
            return 1

    if not rows_to_generate:
        print("Error: No non-empty lines in lyrics file.", file=sys.stderr)
        return 1

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # The per-image generator lives next to this script.
    script_dir = Path(__file__).resolve().parent
    gen_script = script_dir / "gen_image_image_cond.py"

    for idx, (row_id, chars) in enumerate(rows_to_generate, start=1):
        # Output name is keyed to the file row, not the loop index, so a
        # partial/selective rerun produces the same filenames.
        name = f"row_{row_id}"
        prompt = build_prompt(chars)

        # Use the same interpreter that runs this script for the child process.
        cmd = [
            sys.executable,
            str(gen_script),
            "--prompt",
            prompt,
            "--input-image-path",
            args.input_image_path,
            "--output-dir",
            str(output_dir),
            "--name",
            name,
            "--model",
            args.model,
            "--aspect-ratio",
            args.aspect_ratio,
            "--resolution",
            args.resolution,
            "--number-of-images",
            "1",
        ]

        if args.extra_image_paths:
            cmd.extend(["--extra-image-paths"] + args.extra_image_paths)
        if args.thinking_level:
            cmd.extend(["--thinking-level", args.thinking_level])

        print(f"[{idx}/{len(rows_to_generate)}] Row {row_id}: '{chars}' -> {output_dir / f'{name}.png'}")
        result = subprocess.run(cmd)
        if result.returncode != 0:
            # Fail fast: propagate the child's exit code to the caller.
            print(f"Error: Failed to generate image for row {row_id} ('{chars}')", file=sys.stderr)
            return result.returncode

    print(f"\nDone. Generated {len(rows_to_generate)} images in {output_dir}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
gen_video_image_start_end.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import json
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ import tempfile
8
+ import time
9
+ from pathlib import Path
10
+
11
+ from google import genai
12
+ from google.genai import types
13
+
14
+
15
def parse_args() -> argparse.Namespace:
    """Parse CLI options for start/end-frame-conditioned video generation."""
    parser = argparse.ArgumentParser(
        description=(
            "Generate a video conditioned on a start frame and an optional end frame. "
            "If --end-image-path is omitted, the start image is reused as the end frame."
        )
    )
    add = parser.add_argument
    add("--prompt", required=True, help="Prompt describing the video.")
    # Dashed and underscored spellings are both accepted for the image paths.
    add(
        "--start-image-path",
        "--start_image_path",
        dest="start_image_path",
        required=True,
        help="Path to the image used as the start (first) frame.",
    )
    add(
        "--end-image-path",
        "--end_image_path",
        dest="end_image_path",
        default=None,
        help="Path to the image used as the end (last) frame. Defaults to the start image.",
    )
    add(
        "--model",
        default="veo-3.1-generate-preview",
        help="Video generation model name.",
    )
    add("--name", default="generated_video", help="Base output filename.")
    add(
        "--output-dir",
        "--output_dir",
        dest="output_dir",
        default="output_dir",
        help="Directory to save outputs (default: output_dir).",
    )
    add("--resolution", default="720p", help="e.g. 720p, 1080p, 4k")
    add("--duration", type=int, default=8, help="Video length in seconds.")
    add(
        "--aspect-ratio",
        default="16:9",
        help="Aspect ratio (e.g. 16:9, 9:16).",
    )
    add(
        "--negative-prompt",
        default="blurry, low quality, artifacts, text overlay, watermark",
        help="What to avoid.",
    )
    add(
        "--number-of-videos",
        type=int,
        default=1,
        help="How many videos to generate.",
    )
    add(
        "--poll-seconds",
        type=int,
        default=10,
        help="Polling interval while generation is running.",
    )
    return parser.parse_args()
75
+
76
+
77
def strip_audio(video_path: Path) -> None:
    """Remove audio track from video using ffmpeg (video stream copied, no re-encode)."""
    # Create a scratch file; delete=False so ffmpeg can write to it after close.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
        temp_path = Path(f.name)
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-an", "-c:v", "copy", str(temp_path)],
            check=True,  # raise CalledProcessError if ffmpeg fails
            capture_output=True,  # keep ffmpeg's console chatter suppressed
        )
        # Atomically swap the stripped file in; this also removes temp_path,
        # so the cleanup below is a no-op on success.
        temp_path.replace(video_path)
    finally:
        # Remove the scratch file if ffmpeg failed before the replace.
        if temp_path.exists():
            temp_path.unlink()
91
+
92
+
93
def load_image(image_path: Path):
    """Load ``image_path`` into a ``types.Image``, tolerating SDK signature variants."""
    if not image_path.exists():
        raise FileNotFoundError(f"Input image not found: {image_path}")
    path_str = str(image_path)
    try:
        return types.Image.from_file(location=path_str)
    except TypeError:
        # Some SDK builds accept the path only positionally.
        return types.Image.from_file(path_str)
101
+
102
+
103
def main() -> int:
    """CLI entry point: generate video(s) conditioned on start/end frames.

    Returns 0 on success, 1 on configuration errors, 2 when the API returns
    no usable videos.
    """
    args = parse_args()

    if not os.getenv("GEMINI_API_KEY"):
        print("Missing GEMINI_API_KEY environment variable.", file=sys.stderr)
        return 1

    start_path = Path(args.start_image_path).expanduser().resolve()
    # Fall back to the start image when no explicit end frame is given.
    end_path = Path(args.end_image_path).expanduser().resolve() if args.end_image_path else start_path

    print(f"Start frame: {start_path}")
    print(f"End frame: {end_path}")

    first_image = load_image(start_path)
    last_image = load_image(end_path)

    client = genai.Client()

    config = types.GenerateVideosConfig(
        resolution=args.resolution,
        duration_seconds=args.duration,
        aspect_ratio=args.aspect_ratio,
        negative_prompt=args.negative_prompt,
        number_of_videos=args.number_of_videos,
        last_frame=last_image,
    )

    operation = client.models.generate_videos(
        model=args.model,
        prompt=args.prompt,
        image=first_image,
        config=config,
    )

    started_at = time.time()
    while not operation.done:
        elapsed_seconds = int(time.time() - started_at)
        print(f"Waiting for video generation... elapsed: {elapsed_seconds}s")
        time.sleep(args.poll_seconds)
        operation = client.operations.get(operation)

    # A finished operation can carry an error instead of a response (same
    # guard as gen_video_prompt_only_extend.py); without it the attribute
    # access below raises AttributeError on None.
    if operation.response is None:
        err = getattr(operation, "error", None)
        print(f"API returned no response. Error: {err}", file=sys.stderr)
        return 2

    generated = operation.response.generated_videos
    if not generated:
        print("No videos returned by API.", file=sys.stderr)
        return 2

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    base_name = args.name
    saved_files = []

    # A single result keeps the bare base name; multiple results get _1, _2, ...
    for idx, item in enumerate(generated, start=1):
        suffix = "" if len(generated) == 1 else f"_{idx}"
        out_path = out_dir / f"{base_name}{suffix}.mp4"
        video_obj = item.video
        client.files.download(file=video_obj)
        video_obj.save(str(out_path))
        strip_audio(out_path)
        saved_files.append(str(out_path.resolve()))
        print(f"Saved video: {out_path.resolve()}")

    metadata_path = out_dir / f"{base_name}.json"
    metadata = {
        "prompt": args.prompt,
        "model": args.model,
        "start_image_path": str(start_path),
        "end_image_path": str(end_path),
        "config": {
            "resolution": args.resolution,
            "duration_seconds": args.duration,
            "aspect_ratio": args.aspect_ratio,
            "negative_prompt": args.negative_prompt,
            "number_of_videos": args.number_of_videos,
            "poll_seconds": args.poll_seconds,
        },
        "saved_videos": saved_files,
    }
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    print(f"Saved metadata: {metadata_path.resolve()}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
gen_video_prompt_only.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import json
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ import tempfile
8
+ import time
9
+ from pathlib import Path
10
+
11
+ from google import genai
12
+ from google.genai import types
13
+
14
+
15
def strip_audio(video_path: Path) -> None:
    """Remove audio track from video using ffmpeg (video stream copied, no re-encode)."""
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
        scratch = Path(handle.name)
    try:
        cmd = ["ffmpeg", "-y", "-i", str(video_path), "-an", "-c:v", "copy", str(scratch)]
        subprocess.run(cmd, check=True, capture_output=True)
        # On success replace() consumes the scratch file, so cleanup is a no-op.
        scratch.replace(video_path)
    finally:
        if scratch.exists():
            scratch.unlink()
29
+
30
+
31
def parse_args() -> argparse.Namespace:
    """Parse CLI options for prompt-only video generation (Veo)."""
    parser = argparse.ArgumentParser(
        description="Generate a video from a text prompt using Gemini (Veo)."
    )
    parser.add_argument("--prompt", required=True, help="Prompt describing the video.")
    parser.add_argument(
        "--model",
        default="veo-3.1-generate-preview",
        help="Video generation model name.",
    )
    parser.add_argument("--name", default="generated_video", help="Base output filename.")
    # Both dashed and underscored spellings are accepted for this flag.
    parser.add_argument(
        "--output-dir",
        "--output_dir",
        dest="output_dir",
        default="output_dir",
        help="Directory to save outputs (default: output_dir).",
    )
    parser.add_argument("--resolution", default="1080p", help="e.g. 720p, 1080p, 4k")
    parser.add_argument("--duration", type=int, default=8, help="Video length in seconds.")
    parser.add_argument(
        "--aspect-ratio",
        default="16:9",
        help="Aspect ratio (e.g. 16:9, 9:16, 1:1).",
    )
    parser.add_argument(
        "--negative-prompt",
        default="blurry, low quality, artifacts, text overlay, watermark",
        help="What to avoid.",
    )
    parser.add_argument(
        "--number-of-videos",
        type=int,
        default=1,
        help="How many videos to generate.",
    )
    parser.add_argument(
        "--poll-seconds",
        type=int,
        default=10,
        help="Polling interval while generation is running.",
    )
    return parser.parse_args()
74
+
75
+
76
def main() -> int:
    """CLI entry point: generate video(s) from a text prompt.

    Returns 0 on success, 1 on configuration errors, 2 when the API returns
    no usable videos.
    """
    args = parse_args()

    if not os.getenv("GEMINI_API_KEY"):
        print("Missing GEMINI_API_KEY environment variable.", file=sys.stderr)
        return 1

    client = genai.Client()

    config = types.GenerateVideosConfig(
        resolution=args.resolution,
        duration_seconds=args.duration,
        aspect_ratio=args.aspect_ratio,
        negative_prompt=args.negative_prompt,
        number_of_videos=args.number_of_videos,
    )

    operation = client.models.generate_videos(
        model=args.model,
        prompt=args.prompt,
        config=config,
    )

    started_at = time.time()
    while not operation.done:
        elapsed_seconds = int(time.time() - started_at)
        print(f"Waiting for video generation... elapsed: {elapsed_seconds}s")
        time.sleep(args.poll_seconds)
        operation = client.operations.get(operation)

    # A finished operation can carry an error instead of a response (same
    # guard as gen_video_prompt_only_extend.py); without it the attribute
    # access below raises AttributeError on None.
    if operation.response is None:
        err = getattr(operation, "error", None)
        print(f"API returned no response. Error: {err}", file=sys.stderr)
        return 2

    generated = operation.response.generated_videos
    if not generated:
        print("No videos returned by API.", file=sys.stderr)
        return 2

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    base_name = args.name
    saved_files = []

    # A single result keeps the bare base name; multiple results get _1, _2, ...
    for idx, item in enumerate(generated, start=1):
        suffix = "" if len(generated) == 1 else f"_{idx}"
        out_path = out_dir / f"{base_name}{suffix}.mp4"
        video_obj = item.video
        client.files.download(file=video_obj)
        video_obj.save(str(out_path))
        strip_audio(out_path)
        saved_files.append(str(out_path.resolve()))
        print(f"Saved video: {out_path.resolve()}")

    metadata_path = out_dir / f"{base_name}.json"
    metadata = {
        "prompt": args.prompt,
        "model": args.model,
        "config": {
            "resolution": args.resolution,
            "duration_seconds": args.duration,
            "aspect_ratio": args.aspect_ratio,
            "negative_prompt": args.negative_prompt,
            "number_of_videos": args.number_of_videos,
            "poll_seconds": args.poll_seconds,
        },
        "saved_videos": saved_files,
    }
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    print(f"Saved metadata: {metadata_path.resolve()}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
gen_video_prompt_only_extend.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate a video from a text prompt and optionally extend it multiple times.
4
+ Final length = duration * (num_extend + 1).
5
+ Extension only works with VEO-generated videos (API rejects non-VEO sources).
6
+ """
7
+ import argparse
8
+ import json
9
+ import os
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ import time
14
+ from pathlib import Path
15
+
16
+ from google import genai
17
+ from google.genai import types
18
+
19
+
20
def strip_audio(video_path: Path) -> None:
    """Remove audio track from video using ffmpeg (video stream copied, no re-encode)."""
    # Create a scratch file; delete=False so ffmpeg can write to it after close.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
        temp_path = Path(f.name)
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-an", "-c:v", "copy", str(temp_path)],
            check=True,  # raise CalledProcessError if ffmpeg fails
            capture_output=True,  # suppress ffmpeg's console output
        )
        # replace() consumes temp_path on success, so cleanup below is a no-op.
        temp_path.replace(video_path)
    finally:
        # Remove the scratch file if ffmpeg failed before the replace.
        if temp_path.exists():
            temp_path.unlink()
34
+
35
+
36
def load_image(image_path: Path):
    """Load an image file into a types.Image for video conditioning.

    Raises FileNotFoundError when the path does not exist.
    """
    if not image_path.exists():
        raise FileNotFoundError(f"Input image not found: {image_path}")
    try:
        return types.Image.from_file(location=str(image_path))
    except TypeError:
        # Compatibility fallback for SDK variants using positional arg.
        return types.Image.from_file(str(image_path))
44
+
45
+
46
def parse_args() -> argparse.Namespace:
    """Parse CLI options for prompt-only video generation with extension."""
    parser = argparse.ArgumentParser(
        description="Generate a video from a text prompt and optionally extend it (VEO only)."
    )
    # action="append": --prompt may be repeated; either once (reused for every
    # segment) or exactly num_extend+1 times (one per segment).
    parser.add_argument(
        "--prompt",
        action="append",
        required=True,
        help="Prompt(s) for video. Pass once for all segments, or num_extend+1 times for initial + each extension.",
    )
    parser.add_argument(
        "--model",
        default="veo-3.1-generate-preview",
        help="Video generation model name.",
    )
    parser.add_argument("--name", default="generated_video", help="Base output filename.")
    parser.add_argument(
        "--output-dir",
        "--output_dir",
        dest="output_dir",
        default="output_dir",
        help="Directory to save outputs (default: output_dir).",
    )
    parser.add_argument("--resolution", default="1080p", help="e.g. 720p, 1080p, 4k")
    parser.add_argument("--duration", type=int, default=8, help="Video length in seconds.")
    parser.add_argument(
        "--aspect-ratio",
        default="16:9",
        help="Aspect ratio (e.g. 16:9, 9:16, 1:1).",
    )
    parser.add_argument(
        "--negative-prompt",
        default="blurry, low quality, artifacts, text overlay, watermark",
        help="What to avoid.",
    )
    parser.add_argument(
        "--number-of-videos",
        type=int,
        default=1,
        help="How many videos to generate. When num-extend > 0, only the first is extended.",
    )
    parser.add_argument(
        "--num-extend",
        type=int,
        default=0,
        help="How many times to extend the video. Final length = duration * (num_extend + 1).",
    )
    # Image conditioning applies to the initial generation only.
    parser.add_argument(
        "--start-image",
        "--start_image",
        dest="start_image",
        default=None,
        help="Path to image used as the first frame (initial generation only).",
    )
    parser.add_argument(
        "--end-image",
        "--end_image",
        dest="end_image",
        default=None,
        help="Path to image used as the last frame (initial generation only; extensions do not support image conditioning).",
    )
    parser.add_argument(
        "--poll-seconds",
        type=int,
        default=10,
        help="Polling interval while generation is running.",
    )
    return parser.parse_args()
114
+
115
+
116
def main() -> int:
    """Generate a video, then extend it num_extend times (VEO only).

    Each stage is saved separately as <name>_1.mp4, <name>_2.mp4, ...;
    a JSON metadata file is written alongside. Returns 0 on success,
    1 on configuration errors, 2 on API failures.
    """
    args = parse_args()

    if not os.getenv("GEMINI_API_KEY"):
        print("Missing GEMINI_API_KEY environment variable.", file=sys.stderr)
        return 1

    if args.num_extend < 0:
        print("--num-extend must be >= 0.", file=sys.stderr)
        return 1

    # One prompt is broadcast to all segments; otherwise require exactly
    # num_extend+1 prompts (initial generation + one per extension).
    prompts: list[str] = args.prompt
    if len(prompts) > 1:
        expected = args.num_extend + 1
        if len(prompts) != expected:
            print(
                f"With {len(prompts)} prompts, expected num_extend+1 = {expected}. "
                f"Got num_extend={args.num_extend}.",
                file=sys.stderr,
            )
            return 1
    else:
        prompts = [prompts[0]] * (args.num_extend + 1)

    client = genai.Client()

    first_image = None
    if args.start_image:
        start_path = Path(args.start_image).expanduser().resolve()
        first_image = load_image(start_path)
        print(f"Using start image: {start_path}")

    last_image = None
    if args.end_image:
        end_path = Path(args.end_image).expanduser().resolve()
        last_image = load_image(end_path)
        print(f"Using end image: {end_path}")

    config_kwargs = {
        "resolution": args.resolution,
        "duration_seconds": args.duration,
        "aspect_ratio": args.aspect_ratio,
        "negative_prompt": args.negative_prompt,
        "number_of_videos": args.number_of_videos,
    }
    # last_frame is only passed when an end image was supplied.
    if last_image is not None:
        config_kwargs["last_frame"] = last_image
    config = types.GenerateVideosConfig(**config_kwargs)

    # Initial generation
    print("Generating initial video...")
    gen_kwargs = {"model": args.model, "prompt": prompts[0], "config": config}
    if first_image is not None:
        gen_kwargs["image"] = first_image
    operation = client.models.generate_videos(**gen_kwargs)

    # Poll the long-running operation until it completes.
    started_at = time.time()
    while not operation.done:
        elapsed_seconds = int(time.time() - started_at)
        print(f"Waiting for video generation... elapsed: {elapsed_seconds}s")
        time.sleep(args.poll_seconds)
        operation = client.operations.get(operation)

    # A finished operation can carry an error instead of a response.
    if operation.response is None:
        err = getattr(operation, "error", None)
        print(f"API returned no response. Error: {err}", file=sys.stderr)
        return 2
    generated = operation.response.generated_videos
    if not generated:
        print("No videos returned by API.", file=sys.stderr)
        return 2

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    base_name = args.name
    saved_files = []

    # Save initial video as _1 (when extending, only first is used; when not, save all)
    if args.num_extend > 0:
        video_obj = generated[0].video
        client.files.download(file=video_obj)
        out_path = out_dir / f"{base_name}_1.mp4"
        video_obj.save(str(out_path))
        strip_audio(out_path)
        saved_files.append(str(out_path.resolve()))
        print(f"Saved video: {out_path.resolve()}")
    else:
        for idx, item in enumerate(generated, start=1):
            video_obj = item.video
            client.files.download(file=video_obj)
            out_path = out_dir / f"{base_name}_{idx}.mp4"
            video_obj.save(str(out_path))
            strip_audio(out_path)
            saved_files.append(str(out_path.resolve()))
            print(f"Saved video: {out_path.resolve()}")

    # Extend num_extend times (only extends the first video; each stage saved as _2, _3, ...)
    for ext_idx in range(args.num_extend):
        print(f"Extending video ({ext_idx + 1}/{args.num_extend})...")
        # `generated` is rebound each iteration, so this always extends the
        # most recent stage's output.
        video_to_extend = generated[0].video
        client.files.download(file=video_to_extend)
        extend_config = types.GenerateVideosConfig(
            number_of_videos=1,
            resolution=args.resolution,
        )
        operation = client.models.generate_videos(
            model=args.model,
            video=video_to_extend,
            prompt=prompts[ext_idx + 1],
            config=extend_config,
        )

        started_at = time.time()
        while not operation.done:
            elapsed_seconds = int(time.time() - started_at)
            print(f"Waiting for extension... elapsed: {elapsed_seconds}s")
            time.sleep(args.poll_seconds)
            operation = client.operations.get(operation)

        if operation.response is None:
            err = getattr(operation, "error", None)
            print(f"Extension API returned no response. Error: {err}", file=sys.stderr)
            return 2
        generated = operation.response.generated_videos
        if not generated:
            print("No videos returned by extension API.", file=sys.stderr)
            return 2

        # Save this extended video as _2, _3, _4, etc.
        video_idx = ext_idx + 2
        video_obj = generated[0].video
        client.files.download(file=video_obj)
        out_path = out_dir / f"{base_name}_{video_idx}.mp4"
        video_obj.save(str(out_path))
        strip_audio(out_path)
        saved_files.append(str(out_path.resolve()))
        print(f"Saved video: {out_path.resolve()}")

    # NOTE(review): approximate — extension segment length may differ from
    # the initial duration depending on the API; verify against actual output.
    final_duration_approx = args.duration * (args.num_extend + 1)
    metadata_path = out_dir / f"{base_name}.json"
    metadata = {
        "prompts": prompts,
        "model": args.model,
        "config": {
            "resolution": args.resolution,
            "duration_seconds": args.duration,
            "num_extend": args.num_extend,
            "final_duration_approx_seconds": final_duration_approx,
            "aspect_ratio": args.aspect_ratio,
            "negative_prompt": args.negative_prompt,
            "number_of_videos": args.number_of_videos,
            "poll_seconds": args.poll_seconds,
            "start_image": str(Path(args.start_image).expanduser().resolve()) if args.start_image else None,
            "end_image": str(Path(args.end_image).expanduser().resolve()) if args.end_image else None,
        },
        "saved_videos": saved_files,
    }
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    print(f"Saved metadata: {metadata_path.resolve()}")
    print(f"Final length (approx): {final_duration_approx}s")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
generate_lyrics.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Generate a lyrics title image from a text prompt plus a reference image,
# by delegating to gen_image_image_cond.py.
# Requires: GEMINI_API_KEY in the environment.
set -euo pipefail

# Reference image used as the style/content condition.
input_image_path=/Users/lehongwu/Projects/others/lyrics/VideoGeneration/contents_zhuyu/qilin_example.png

# Route API traffic through a local proxy unless one is already configured
# (same default-preserving pattern as generate_lyrics_batch.sh).
export http_proxy="${http_proxy:-http://127.0.0.1:7890}"
export https_proxy="${https_proxy:-http://127.0.0.1:7890}"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  # Diagnostics belong on stderr, not stdout.
  echo "Error: GEMINI_API_KEY is not set." >&2
  echo 'Run: export GEMINI_API_KEY="your_api_key"' >&2
  exit 1
fi

# Timestamped output directory so repeated runs never overwrite each other.
datetime=$(date +%m%d%H%M%S)
name=gen_lyrics
output_dir="output_image/${name}_${datetime}"

prompt="
Put chinese characters '雪白的天色' in the center of the image.
The sizes of each character should be consistent, and similar with the original image.
Follow the font style of the original image. Black calligraphy on white background.
"

python gen_image_image_cond.py \
  --prompt "$prompt" \
  --input-image-path "$input_image_path" \
  --model gemini-3.1-flash-image-preview \
  --aspect-ratio 16:9 \
  --resolution 2K \
  --number-of-images 1 \
  --name "$name" \
  --output-dir "$output_dir"
generate_lyrics_batch.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Batch-generate lyric images: one image per selected row of a lyrics file,
# conditioned on a reference image. Delegates to gen_lyrics_batch.py.
set -euo pipefail

# --- Configuration (edit as needed) ---
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
lyrics_file="/Users/lehongwu/Projects/others/lyrics/VideoGeneration/contents_zhuyu/zhuyu_lyrics_v1.txt"
input_image_path="/Users/lehongwu/Projects/others/lyrics/VideoGeneration/output_image/midian_example_0312135300/midian_example_2.png"
model="gemini-3.1-flash-image-preview"
aspect_ratio="16:9"
resolution="1080p"

# Output dir (default: output_image/gen_lyrics_batch_<timestamp>)
datetime=$(date +%m%d%H%M%S)
output_dir="${script_dir}/output_image/gen_lyrics_batch_${datetime}"

# Specific row IDs to generate (empty = all). e.g. row_ids="1 5 10"
row_ids="48 49 50 51 52 53 54 55 56 57"

# Proxy (optional): only applied when not already set in the environment.
export http_proxy="${http_proxy:-http://127.0.0.1:7890}"
export https_proxy="${https_proxy:-http://127.0.0.1:7890}"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  echo "Error: GEMINI_API_KEY is not set."
  echo 'Run: export GEMINI_API_KEY="your_api_key"'
  exit 1
fi

if [[ ! -f "$lyrics_file" ]]; then
  echo "Error: Lyrics file not found: $lyrics_file"
  exit 1
fi

echo "Lyrics file: $lyrics_file"
echo "Input image: $input_image_path"
echo "Output dir: $output_dir"
if [[ -n "$row_ids" ]]; then
  echo "Row IDs: $row_ids"
fi
echo ""

# Build the command as an array so quoted paths survive intact.
cmd=(python "$script_dir/gen_lyrics_batch.py"
  --lyrics-file "$lyrics_file"
  --input-image-path "$input_image_path"
  --output-dir "$output_dir"
  --model "$model"
  --aspect-ratio "$aspect_ratio"
  --resolution "$resolution")
if [[ -n "$row_ids" ]]; then
  # Intentional word-splitting: row_ids is a space-separated list of IDs.
  # shellcheck disable=SC2086
  cmd+=(--row-ids $row_ids)
fi
"${cmd[@]}"
image_super_resolution.sh ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Upscale / clean up a lyrics image via gen_image_image_cond.py.
# Requires: GEMINI_API_KEY in the environment.
set -euo pipefail

input_image_path=/Users/lehongwu/Projects/others/lyrics/VideoGeneration/contents_zhuyu/midian_example.png

export http_proxy="http://127.0.0.1:7890"
export https_proxy="http://127.0.0.1:7890"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  # Diagnostics belong on stderr, not stdout.
  echo "Error: GEMINI_API_KEY is not set." >&2
  echo 'Run: export GEMINI_API_KEY="your_api_key"' >&2
  exit 1
fi

# Timestamped output directory so repeated runs never overwrite each other.
datetime=$(date +%m%d%H%M%S)
name=super_resolution
output_dir="output_image/${name}_${datetime}"

# BUG FIX: the inner double quotes around 潮湿的路上 previously terminated and
# reopened the string, so the quote characters were silently dropped from the
# prompt. Escaping them keeps the literal quotes in the text sent to the model.
prompt="
Make it higher resolution. Extract black characters on white background, but style unchanged.
Only keep the second row of text \"潮湿的路上\" and place it in the center of the image.
"

python gen_image_image_cond.py \
  --prompt "$prompt" \
  --input-image-path "$input_image_path" \
  --model gemini-3.1-flash-image-preview \
  --aspect-ratio 16:9 \
  --resolution 1080p \
  --number-of-images 1 \
  --name "$name" \
  --output-dir "$output_dir"
run_gen_image_image_cond.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Edit an existing image with a text prompt via gen_image_image_cond.py.
set -euo pipefail

# Frame extracted from an earlier video run; used as the edit source.
input_image_path=/Users/lehongwu/Projects/others/lyrics/VideoGeneration/output_video/leaves_video_0309184011/debug_frame6.png

export http_proxy="http://127.0.0.1:7890"
export https_proxy="http://127.0.0.1:7890"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  echo "Error: GEMINI_API_KEY is not set."
  echo 'Run: export GEMINI_API_KEY="your_api_key"'
  exit 1
fi

# Per-run timestamp keeps output directories unique.
run_stamp=$(date +%m%d%H%M%S)
name=flowers_to_leaves
output_dir="output_image/${name}_${run_stamp}"

prompt="Change the background to gold and with soft sun glows from top."

python gen_image_image_cond.py \
  --prompt "$prompt" \
  --input-image-path "$input_image_path" \
  --model gemini-3.1-flash-image-preview \
  --aspect-ratio 16:9 \
  --resolution 2K \
  --number-of-images 1 \
  --name "$name" \
  --output-dir "$output_dir"
run_gen_image_prompt_only.sh ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Generate an image from a text prompt only, via gen_image_prompt_only.py.
# Requires: GEMINI_API_KEY in the environment.
set -euo pipefail

export http_proxy="http://127.0.0.1:7890"
export https_proxy="http://127.0.0.1:7890"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  echo "Error: GEMINI_API_KEY is not set." >&2
  echo 'Run: export GEMINI_API_KEY="your_api_key"' >&2
  exit 1
fi

# Timestamped output directory so repeated runs never overwrite each other.
datetime=$(date +%m%d%H%M%S)
name=leaves
output_dir="output_image/${name}_${datetime}"

prompt="
Autumn leaves drifting on the black background, but the locations can follow the original image.
The autumn leaves should look diverse in shapes, sizes, and colors should be red and gold.
Not all leaves are facing the camera, instead, they are in random directions as if drifting in the wind.
But overall, the leaves should not be too dense or too large.
"

# FIX: $name and $output_dir are now quoted (they were bare expansions, which
# would word-split/glob if either ever contained spaces or metacharacters),
# matching how every sibling script passes these arguments.
python gen_image_prompt_only.py \
  --prompt "$prompt" \
  --model gemini-3.1-flash-image-preview \
  --aspect-ratio 16:9 \
  --resolution 2K \
  --number-of-images 1 \
  --name "$name" \
  --output-dir "$output_dir"
run_gen_video_image_start_end.sh ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Animate a single still frame into a looping video via gen_video_image_start_end.py.
set -euo pipefail

start_image_path=/Users/lehongwu/Projects/others/lyrics/VideoGeneration/contents_zhuyu/bg3_frame0.png

export http_proxy="http://127.0.0.1:7890"
export https_proxy="http://127.0.0.1:7890"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  echo "Error: GEMINI_API_KEY is not set."
  echo 'Run: export GEMINI_API_KEY="your_api_key"'
  exit 1
fi

# Per-run timestamp keeps output directories unique.
run_stamp=$(date +%m%d%H%M%S)
name=img_cond_start_end
output_dir="output/${name}_${run_stamp}"

prompt="
Transform this image into: The water flows slowly under the moonlight. Loop video.
"

# NOTE: gen_video_image_start_end.py expects the underscore flag --output_dir.
python gen_video_image_start_end.py \
  --prompt "$prompt" \
  --start-image-path "$start_image_path" \
  --resolution 4k \
  --duration 8 \
  --aspect-ratio 16:9 \
  --name "$name" \
  --output_dir "$output_dir"
run_gen_video_image_start_end_diff.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Generate a morphing video between two different keyframes
# via gen_video_image_start_end.py.
set -euo pipefail

start_image_path=/Users/lehongwu/Projects/others/VideoGeneration/input/example.png
end_image_path=/Users/lehongwu/Projects/others/VideoGeneration/input/example_end.png

export http_proxy="http://127.0.0.1:7890"
export https_proxy="http://127.0.0.1:7890"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  echo "Error: GEMINI_API_KEY is not set."
  echo 'Run: export GEMINI_API_KEY="your_api_key"'
  exit 1
fi

# Per-run timestamp keeps output directories unique.
run_stamp=$(date +%m%d%H%M%S)
name=img_cond_diff
output_dir="output/${name}_${run_stamp}"

prompt="
A cinematic transition where the scene smoothly morphs from the first image to the last image,
with fluid motion, consistent lighting, and a natural, seamless progression.
"

# NOTE: gen_video_image_start_end.py expects the underscore flag --output_dir.
python gen_video_image_start_end.py \
  --prompt "$prompt" \
  --start-image-path "$start_image_path" \
  --end-image-path "$end_image_path" \
  --resolution 4k \
  --duration 8 \
  --aspect-ratio 16:9 \
  --name "$name" \
  --output_dir "$output_dir"
run_gen_video_prompt_only.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Generate a video from a text prompt only, via gen_video_prompt_only.py.
# Requires: GEMINI_API_KEY in the environment.
set -euo pipefail

export http_proxy="http://127.0.0.1:7890"
export https_proxy="http://127.0.0.1:7890"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  echo "Error: GEMINI_API_KEY is not set." >&2
  echo 'Run: export GEMINI_API_KEY="your_api_key"' >&2
  exit 1
fi

# Timestamped output directory so repeated runs never overwrite each other.
datetime=$(date +%m%d%H%M%S)
name=leaves_video
output_dir="output_video/${name}_${datetime}"

prompt="
Autumn leaves drifting slowly on the black background.
The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse.
Overall the leaves should be sparse and extremely small, because this serves as background of some video.
"

# FIX: $name and $output_dir are now quoted (they were bare expansions, which
# would word-split/glob if either ever contained spaces), matching the sibling
# scripts. NOTE: gen_video_prompt_only.py expects the underscore flag --output_dir.
python gen_video_prompt_only.py \
  --prompt "$prompt" \
  --resolution 720p \
  --duration 8 \
  --aspect-ratio 16:9 \
  --name "$name" \
  --output_dir "$output_dir"
run_gen_video_prompt_only_extend.sh ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Generate a long video by chaining extensions via gen_video_prompt_only_extend.py.
# Final length = duration * (num_extend + 1). Requires GEMINI_API_KEY.
set -euo pipefail

export http_proxy="http://127.0.0.1:7890"
export https_proxy="http://127.0.0.1:7890"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  echo "Error: GEMINI_API_KEY is not set." >&2
  echo 'Run: export GEMINI_API_KEY="your_api_key"' >&2
  exit 1
fi

datetime=$(date +%m%d%H%M%S)
name=leaves_video_emit
output_dir="output_video/${name}_${datetime}"

# How many times to extend. Final length = duration * (num_extend + 1)
num_extend=4

# Must have num_extend+1 prompts (initial + one per extension)
prompts=(
  "Autumn leaves drifting slowly on the black background. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. Overall the leaves should be sparse and extremely small. Start from a black image."
  "Autumn leaves drifting slowly on the black background. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. Overall the leaves should be sparse and extremely small. The density and speed of the leaves should be consistent."
  "Autumn leaves drifting slowly on the black background. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. Overall the leaves should be sparse and extremely small. The density and speed of the leaves should be consistent. More leaves are coming in the back."
  "Autumn leaves drifting slowly on the black background. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. More leaves. Even more leaves are coming in the back."
  "Autumn leaves drifting slowly on the black background. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. The density of leaves are consistent. Finally disappear and ends with a black image."
)

# FIX: the comment above stated the prompt-count requirement but nothing
# enforced it; fail fast instead of discovering a mismatch mid-generation.
if (( ${#prompts[@]} != num_extend + 1 )); then
  echo "Error: need $((num_extend + 1)) prompts (num_extend+1), got ${#prompts[@]}." >&2
  exit 1
fi

prompt_args=()
for p in "${prompts[@]}"; do
  prompt_args+=(--prompt "$p")
done

# video extension only supports 720p
# FIX: $num_extend, $name and $output_dir are now quoted (previously bare).
python gen_video_prompt_only_extend.py \
  "${prompt_args[@]}" \
  --resolution 720p \
  --duration 8 \
  --aspect-ratio 16:9 \
  --num-extend "$num_extend" \
  --start-image /Users/lehongwu/Projects/others/lyrics/VideoGeneration/output_video/leaves_video_emit_0310100712/leaves_video_debug_frame0.2.png \
  --end-image /Users/lehongwu/Projects/others/lyrics/VideoGeneration/output_video/leaves_video_emit_0310100712/leaves_video_debug_frame11.png \
  --name "$name" \
  --output_dir "$output_dir"
run_gen_video_prompt_only_extend_2.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Generate a long "falling leaves" video by chaining extensions via
# gen_video_prompt_only_extend.py. Final length = duration * (num_extend + 1).
# Requires GEMINI_API_KEY.
set -euo pipefail

export http_proxy="http://127.0.0.1:7890"
export https_proxy="http://127.0.0.1:7890"

if [[ -z "${GEMINI_API_KEY:-}" ]]; then
  echo "Error: GEMINI_API_KEY is not set." >&2
  echo 'Run: export GEMINI_API_KEY="your_api_key"' >&2
  exit 1
fi

datetime=$(date +%m%d%H%M%S)
name=leaves_video_drop
output_dir="output_video/${name}_${datetime}"

# How many times to extend. Final length = duration * (num_extend + 1)
num_extend=4

# Must have num_extend+1 prompts (initial + one per extension)
prompts=(
  "Autumn leaves falling down at a fixed speed. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. Change the background to gold and with soft sun glows from top. The dropping speed and leaves density should be consistent."
  "Autumn leaves falling down at a fixed speed. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. Change the background to gold and with soft sun glows from top. The dropping speed and leaves density should be consistent."
  "Autumn leaves falling down at a fixed speed. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. Change the background to gold and with soft sun glows from top. The dropping speed and leaves density should be consistent."
  "Autumn leaves falling down at a fixed speed. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. Change the background to gold and with soft sun glows from top. The dropping speed and leaves density should be consistent."
  "Autumn leaves falling down at a fixed speed. The leaves are of different shapes, colors ranging from red to gold, and distances to camera are diverse. Change the background to gold and with soft sun glows from top. The dropping speed and leaves density should be consistent."
)

# FIX: enforce the documented prompt-count requirement instead of failing
# (or silently misbehaving) mid-generation.
if (( ${#prompts[@]} != num_extend + 1 )); then
  echo "Error: need $((num_extend + 1)) prompts (num_extend+1), got ${#prompts[@]}." >&2
  exit 1
fi

prompt_args=()
for p in "${prompts[@]}"; do
  prompt_args+=(--prompt "$p")
done

# video extension only supports 720p
# FIX: $num_extend, $name and $output_dir are now quoted (previously bare).
python gen_video_prompt_only_extend.py \
  "${prompt_args[@]}" \
  --resolution 720p \
  --duration 8 \
  --aspect-ratio 16:9 \
  --num-extend "$num_extend" \
  --start-image /Users/lehongwu/Projects/others/lyrics/VideoGeneration/output_video/leaves_video_0309184011/debug_frame0.png \
  --end-image /Users/lehongwu/Projects/others/lyrics/VideoGeneration/output_video/leaves_video_0309184011/debug_frame6_gold.png \
  --name "$name" \
  --output_dir "$output_dir"
web/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Web application package
web/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (151 Bytes). View file
 
web/backend/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Backend package
web/backend/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (159 Bytes). View file
 
web/backend/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.44 kB). View file
 
web/backend/__pycache__/deps.cpython-312.pyc ADDED
Binary file (776 Bytes). View file