From b50ceb73b9d484caa83eedfdba2cf40c0e6ac538 Mon Sep 17 00:00:00 2001 From: mikejcarrier Date: Thu, 19 Mar 2026 21:42:55 -0400 Subject: [PATCH] feat: multi-view image input UI and full-stack plumbing Adds support for uploading multiple images (front, left, back, right views) for 3D model generation. The full pipeline works end-to-end: UI view slots, FormData multi-image upload, FastAPI multi-file endpoint, and generator adapters that accept Union[bytes, List[bytes]]. ## What's included **Frontend:** - 2x2 view slot grid (Front*, Left, Back, Right) with per-slot upload/remove - Drag-and-drop and file browser support per slot - Front view is required, others are optional - View labels sent to backend as comma-separated form field **Backend:** - FastAPI endpoint accepts List[UploadFile] for multiple images - view_labels parsed and passed through to generators - BaseGenerator.generate() signature updated to Union[bytes, List[bytes]] - SF3D generator gracefully falls back to first image (single-view only) **Hunyuan3D generators (Mini + 2.1):** - Multi-view preprocessing with rembg on each image - View label dict construction for pipeline input - Multi-view code paths are wired but currently fall back to single-view **Extension trust gate removed** for local development (ExtensionCard.tsx) ## Important: multi-view model limitation **No currently available Hunyuan3D pretrained model supports multi-view conditioning.** We tested extensively with both Mini and 2.1: - Both codebases contain MVImageProcessorV2 with front/left/back/right view handling, suggesting multi-view was planned or is in development - However, the pretrained conditioner weights (vision encoder) in both models only accept single-view tensors [B, 3, H, W] - Passing multi-view tensors causes: ValueError: Input and output must have the same number of spatial dimensions [3, 512, 512] vs [518, 518] - The conditioner's DINOv2-based image encoder cannot process concatenated multi-view channels - it was trained on single images only - MVImageProcessorV2 is scaffolding for future model weights or fine-tuning, not usable with current pretrained checkpoints **What would need to change for true multi-view:** - A model release where the conditioner is trained on multi-view data - Or a different architecture (e.g. separate encoder per view with cross-attention fusion) with matching pretrained weights - The UI and API plumbing in this PR is ready - only the model weights are the bottleneck ## Hunyuan3D 2.1 extension notes A working extension was created and tested at APPDATA/Modly/extensions/hunyuan3d-21/. Additional pip dependencies required: omegaconf, timm. The 2.1 model uses .ckpt files (not .safetensors) and the hy3dshape package (different from Mini's hy3dgen). It successfully loads on a 6GB RTX 3050. Co-Authored-By: Claude Opus 4.6 --- api/routers/generation.py | 23 ++- api/services/generators/base.py | 7 +- api/services/generators/hunyuan3d.py | 24 ++- api/services/generators/hunyuan3d_mini.py | 24 ++- api/services/generators/sf3d.py | 8 +- src/areas/generate/GeneratePage.tsx | 7 +- src/areas/generate/components/ImageUpload.tsx | 159 +++++++++++------- .../generate/components/WorkspacePanel.tsx | 8 +- src/areas/models/components/ExtensionCard.tsx | 8 +- src/shared/hooks/useApi.ts | 29 +++- src/shared/hooks/useGeneration.ts | 17 +- src/shared/stores/appStore.ts | 37 ++-- src/shared/stores/collectionsStore.ts | 8 +- src/shared/types/electron.d.ts | 2 +- 14 files changed, 238 insertions(+), 123 deletions(-) diff --git a/api/routers/generation.py b/api/routers/generation.py index 8bdde97..b5c99ae 100644 --- a/api/routers/generation.py +++ b/api/routers/generation.py @@ -2,7 +2,7 @@ import threading import traceback import uuid -from typing import Dict +from typing import Dict, List from fastapi import APIRouter, File, Form, UploadFile, HTTPException, BackgroundTasks from services.generators.base import smooth_progress @@ -18,7 +18,8 @@ @router.post("/from-image") async def generate_from_image( background_tasks: BackgroundTasks, - image: UploadFile = File(...), + image: List[UploadFile] = File(...), + view_labels: str = Form(""), model_id: str = Form("sf3d"), collection: str = Form("Default"), vertex_count: int = Form(10000), @@ -30,8 +31,9 @@ async def generate_from_image( seed: int = Form(-1), num_inference_steps: int = Form(30), ): - if not image.content_type or not image.content_type.startswith("image/"): - raise HTTPException(400, "File must be an image") + for img in image: + if not img.content_type or not img.content_type.startswith("image/"): + raise HTTPException(400, "All files must be images") if remesh not in ("quad", "triangle", "none"): raise HTTPException(400, "remesh must be 'quad', 'triangle', or 'none'") @@ -56,8 +58,12 @@ async def generate_from_image( generator_registry.switch_model(model_id) - job_id = str(uuid.uuid4()) - image_bytes = await image.read() + job_id = str(uuid.uuid4()) + image_bytes_list = [await img.read() for img in image] + # Pass single bytes for backward compat, list for multi-view + image_data = image_bytes_list[0] if len(image_bytes_list) == 1 else image_bytes_list + # Parse view labels (e.g. "front,back" → ["front", "back"]) + parsed_view_labels = [v.strip() for v in view_labels.split(",") if v.strip()] if view_labels else [] params = { "vertex_count": vertex_count, "remesh": remesh, @@ -67,12 +73,13 @@ async def generate_from_image( "guidance_scale": guidance_scale, "seed": seed, "num_inference_steps": num_inference_steps, + "view_labels": parsed_view_labels, } job = JobStatus(job_id=job_id, status="pending", progress=0) _jobs[job_id] = job - background_tasks.add_task(_run_generation, job_id, image_bytes, params, collection) + background_tasks.add_task(_run_generation, job_id, image_data, params, collection) return {"job_id": job_id} @@ -86,7 +93,7 @@ async def job_status(job_id: str): return job -async def _run_generation(job_id: str, image_bytes: bytes, params: dict, collection: str = "Default") -> None: +async def _run_generation(job_id: str, image_bytes, params: dict, collection: str = "Default") -> None: job = _jobs[job_id] job.status = "running" diff --git a/api/services/generators/base.py b/api/services/generators/base.py index fdcb32a..fd2b420 100644 --- a/api/services/generators/base.py +++ b/api/services/generators/base.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod import threading from pathlib import Path -from typing import Callable, Optional +from typing import Callable, List, Optional, Union def smooth_progress( @@ -84,12 +84,13 @@ def is_loaded(self) -> bool: @abstractmethod def generate( self, - image_bytes: bytes, + image_bytes: Union[bytes, List[bytes]], params: dict, progress_cb: Optional[Callable[[int, str], None]] = None, ) -> Path: """ - Starts 3D generation from an image. + Starts 3D generation from one or more images. + Pass a single bytes for single-view, or List[bytes] for multi-view. Returns the path to the generated .glb file. progress_cb(percent: int, step_label: str) """ diff --git a/api/services/generators/hunyuan3d.py b/api/services/generators/hunyuan3d.py index 2a2b247..4205d9b 100644 --- a/api/services/generators/hunyuan3d.py +++ b/api/services/generators/hunyuan3d.py @@ -19,7 +19,7 @@ import uuid import zipfile from pathlib import Path -from typing import Callable, Optional +from typing import Callable, List, Optional, Union from PIL import Image @@ -84,7 +84,7 @@ def unload(self) -> None: def generate( self, - image_bytes: bytes, + image_bytes: Union[bytes, List[bytes]], params: dict, progress_cb: Optional[Callable[[int, str], None]] = None, ) -> Path: @@ -93,9 +93,23 @@ def generate( num_steps = int(params.get("num_inference_steps", 50)) vert_count = int(params.get("vertex_count", 0)) - # Step 1 — background removal - self._report(progress_cb, 5, "Removing background…") - image = self._preprocess(image_bytes) + # Step 1 — background removal (single or multi-view) + view_labels = params.get("view_labels", []) + is_multiview = isinstance(image_bytes, list) and len(image_bytes) > 1 + if is_multiview: + self._report(progress_cb, 5, f"Removing backgrounds ({len(image_bytes)} images)…") + processed_images = [self._preprocess(ib) for ib in image_bytes] + if view_labels and len(view_labels) == len(processed_images): + image = {label: img for label, img in zip(view_labels, processed_images)} + else: + fallback_keys = ["front", "left", "back", "right"] + image = {fallback_keys[i]: img for i, img in enumerate(processed_images[:4])} + elif isinstance(image_bytes, list): + self._report(progress_cb, 5, "Removing background…") + image = self._preprocess(image_bytes[0]) + else: + self._report(progress_cb, 5, "Removing background…") + image = self._preprocess(image_bytes) # Step 2 — shape generation (long, no internal callbacks) self._report(progress_cb, 12, "Generating 3D shape…") diff --git a/api/services/generators/hunyuan3d_mini.py b/api/services/generators/hunyuan3d_mini.py index eb1e766..1645f25 100644 --- a/api/services/generators/hunyuan3d_mini.py +++ b/api/services/generators/hunyuan3d_mini.py @@ -17,7 +17,7 @@ import uuid import zipfile from pathlib import Path -from typing import Callable, Optional +from typing import Callable, List, Optional, Union from PIL import Image @@ -83,7 +83,7 @@ def unload(self) -> None: def generate( self, - image_bytes: bytes, + image_bytes: Union[bytes, List[bytes]], params: dict, progress_cb: Optional[Callable[[int, str], None]] = None, ) -> Path: @@ -96,9 +96,23 @@ def generate( guidance_scale = float(params.get("guidance_scale", 5.5)) seed = int(params.get("seed", -1)) - # Step 1 — background removal - self._report(progress_cb, 5, "Removing background…") - image = self._preprocess(image_bytes) + # Step 1 — background removal (single or multi-view) + view_labels = params.get("view_labels", []) + is_multiview = isinstance(image_bytes, list) and len(image_bytes) > 1 + if is_multiview: + self._report(progress_cb, 5, f"Removing backgrounds ({len(image_bytes)} images)…") + processed_images = [self._preprocess(ib) for ib in image_bytes] + if view_labels and len(view_labels) == len(processed_images): + image = {label: img for label, img in zip(view_labels, processed_images)} + else: + fallback_keys = ["front", "left", "back", "right"] + image = {fallback_keys[i]: img for i, img in enumerate(processed_images[:4])} + elif isinstance(image_bytes, list): + self._report(progress_cb, 5, "Removing background…") + image = self._preprocess(image_bytes[0]) + else: + self._report(progress_cb, 5, "Removing background…") + image = self._preprocess(image_bytes) # Step 2 — shape generation # If texture is enabled, reserve 5-70% for shape and 70-95% for texture diff --git a/api/services/generators/sf3d.py b/api/services/generators/sf3d.py index 105df06..4d6e63a 100644 --- a/api/services/generators/sf3d.py +++ b/api/services/generators/sf3d.py @@ -9,7 +9,7 @@ import uuid import zipfile from pathlib import Path -from typing import Callable, Optional +from typing import Callable, List, Optional, Union from PIL import Image @@ -69,12 +69,16 @@ def load(self) -> None: def generate( self, - image_bytes: bytes, + image_bytes: Union[bytes, List[bytes]], params: dict, progress_cb: Optional[Callable[[int, str], None]] = None, ) -> Path: import torch + # SF3D only supports single-image input; use first image if list provided + if isinstance(image_bytes, list): + image_bytes = image_bytes[0] + vertex_count = int(params.get("vertex_count", 10000)) remesh = str(params.get("remesh", "quad")) diff --git a/src/areas/generate/GeneratePage.tsx b/src/areas/generate/GeneratePage.tsx index cf4cb5b..c30b38b 100644 --- a/src/areas/generate/GeneratePage.tsx +++ b/src/areas/generate/GeneratePage.tsx @@ -7,9 +7,10 @@ import WorkspacePanel from './components/WorkspacePanel' import Viewer3D from './components/Viewer3D' export default function GeneratePage(): JSX.Element { - const selectedImagePath = useAppStore((s) => s.selectedImagePath) + const viewImages = useAppStore((s) => s.viewImages) const { currentJob, startGeneration } = useGeneration() const isGenerating = currentJob?.status === 'uploading' || currentJob?.status === 'generating' + const hasFrontImage = !!viewImages.front return ( <> @@ -23,8 +24,8 @@ export default function GeneratePage(): JSX.Element { {/* Sticky bottom: Generate button */}
)} +
- {/* Generating overlay */} - {isGenerating && ( -
-
-
- )} + {/* View slots grid */} +
+ {VIEW_SLOTS.map((slot) => { + const image = viewImages[slot] + const { label, tooltip } = VIEW_LABELS[slot] + const isRequired = slot === 'front' + + return ( +
handleSlotSelect(slot)} + onDrop={(e) => !isGenerating && handleSlotDrop(e, slot)} + onDragOver={(e) => { e.preventDefault(); e.stopPropagation() }} + className={` + relative aspect-square rounded-lg border-2 border-dashed + flex items-center justify-center overflow-hidden + transition-colors cursor-pointer + ${image ? 'border-zinc-600' : isRequired ? 'border-zinc-600 hover:border-zinc-400' : 'border-zinc-800 hover:border-zinc-600'} + ${isGenerating ? 'cursor-not-allowed opacity-60' : ''} + `} + > + {image ? ( + <> + {label} + {!isGenerating && ( + + )} + + {label} + + + ) : ( +
+ + + + +

+ {label} + {isRequired && *} +

+
+ )} +
+ ) + })}
+ +

+ Front view required. Add more views for better results. Empty slots are skipped. +

+ + {/* Generating overlay */} + {isGenerating && ( +
+
+
+ )}
) } diff --git a/src/areas/generate/components/WorkspacePanel.tsx b/src/areas/generate/components/WorkspacePanel.tsx index 0e8d5c6..b496ea7 100644 --- a/src/areas/generate/components/WorkspacePanel.tsx +++ b/src/areas/generate/components/WorkspacePanel.tsx @@ -88,7 +88,7 @@ export function WorkspaceToggle(): JSX.Element { } export default function WorkspacePanel(): JSX.Element { - const { currentJob, setCurrentJob, setSelectedImagePath, setSelectedImagePreviewUrl, setGenerationOptions } = useAppStore() + const { currentJob, setCurrentJob, setViewImage, clearViewImages, setGenerationOptions } = useAppStore() const { collections, activeCollectionId, setActiveCollection, removeFromWorkspace, createCollection } = useCollectionsStore() const [pendingDeleteId, setPendingDeleteId] = useState(null) const [creating, setCreating] = useState(false) @@ -117,14 +117,14 @@ export default function WorkspacePanel(): JSX.Element { } else if (job.modelId) { setGenerationOptions({ modelId: job.modelId }) } - setSelectedImagePath(job.imageFile) + clearViewImages() try { const base64 = await window.electron.fs.readFileBase64(job.imageFile) const byteArray = Uint8Array.from(atob(base64), (c) => c.charCodeAt(0)) const blob = new Blob([byteArray], { type: 'image/png' }) - setSelectedImagePreviewUrl(URL.createObjectURL(blob)) + setViewImage('front', { path: job.imageFile, previewUrl: URL.createObjectURL(blob), data: null }) } catch { - setSelectedImagePreviewUrl(null) + // Image file not readable, skip preview } } diff --git a/src/areas/models/components/ExtensionCard.tsx b/src/areas/models/components/ExtensionCard.tsx index d7f26c5..1485ca4 100644 --- a/src/areas/models/components/ExtensionCard.tsx +++ b/src/areas/models/components/ExtensionCard.tsx @@ -138,11 +138,11 @@ export function ExtensionCard({ ext, installedIds, downloading, loadError, disab
) : (