Commit 436f5aa by haodongli
1 parent: a58f1cf
Files changed (8):
  1. .gitignore +6 -0
  2. app.py +103 -0
  3. infer.py +472 -0
  4. infer.sh +30 -0
  5. pipeline.py +214 -0
  6. requirements.txt +14 -0
  7. utils/image_utils.py +514 -0
  8. utils/seed_all.py +33 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
+ __pycache__/
+ outputs/
+ tmp/
+ .DS_Store
+ weights/
+ tmp_*
app.py ADDED
@@ -0,0 +1,103 @@
+ import spaces  # must be first!
+ import sys
+ import os
+ import torch
+ from PIL import Image
+ import gradio as gr
+ from glob import glob
+ from contextlib import nullcontext
+ from pipeline import Lotus2Pipeline
+ from diffusers import (
+     FlowMatchEulerDiscreteScheduler,
+     FluxTransformer2DModel,
+ )
+ from infer import (
+     load_lora_and_lcm_weights,
+     process_single_image
+ )
+
+ pipeline = None
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ weight_dtype = torch.bfloat16
+ task = None
+
+ @spaces.GPU
+ def load_pipeline():
+     global pipeline, device, weight_dtype, task
+     noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+         'black-forest-labs/FLUX.1-dev', subfolder="scheduler", num_train_timesteps=10
+     )
+     transformer = FluxTransformer2DModel.from_pretrained(
+         'black-forest-labs/FLUX.1-dev', subfolder="transformer", revision=None, variant=None
+     )
+     transformer.requires_grad_(False)
+     transformer.to(device=device, dtype=weight_dtype)
+     transformer, local_continuity_module = load_lora_and_lcm_weights(transformer, None, None, None, task)
+     pipeline = Lotus2Pipeline.from_pretrained(
+         'black-forest-labs/FLUX.1-dev',
+         scheduler=noise_scheduler,
+         transformer=transformer,
+         revision=None,
+         variant=None,
+         torch_dtype=weight_dtype,
+     )
+     pipeline.local_continuity_module = local_continuity_module
+     pipeline = pipeline.to(device)
+
+ @spaces.GPU
+ def fn(image_path):
+     global pipeline, device, task
+     pipeline.set_progress_bar_config(disable=True)
+     with nullcontext():
+         _, output_vis, _ = process_single_image(
+             image_path, pipeline,
+             task_name=task,
+             device=device,
+             num_inference_steps=10,
+             process_res=1024
+         )
+     return [Image.open(image_path), output_vis]
+
+ def build_demo():
+     global task
+     inputs = [
+         gr.Image(label="Image", type="filepath")
+     ]
+     outputs = [
+         gr.ImageSlider(
+             label=f"{task.title()}",
+             type="pil",
+             slider_position=20,
+         )
+     ]
+     examples = glob(f"assets/demo_examples/{task}/*.png") + glob(f"assets/demo_examples/{task}/*.jpg")
+     demo = gr.Interface(
+         fn=fn,
+         title="Lotus-2: Advancing Geometric Dense Prediction with Powerful Image Generative Model",
+         description=f"""
+         <strong>Please consider starring <span style="color: orange">&#9733;</span> our <a href="https://github.com/EnVision-Research/Lotus-2" target="_blank" rel="noopener noreferrer">GitHub Repo</a> if you find this demo useful! 😊</strong>
+         <br>
+         <strong>Current Task: </strong><strong style="color: red;">{task.title()}</strong>
+         """,
+         inputs=inputs,
+         outputs=outputs,
+         examples=examples,
+         examples_per_page=10
+     )
+     return demo
+
+ def main(task_name):
+     global task
+     task = task_name
+     load_pipeline()
+     demo = build_demo()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=6381,
+     )
+
+ if __name__ == "__main__":
+     task_name = "depth"
+     if task_name not in ['depth', 'normal']:
+         raise ValueError("Invalid task. Please choose from 'depth' and 'normal'.")
+     main(task_name)
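
The demo above hard-codes `task_name = "depth"` and drives everything through module-level globals. A minimal local smoke test of the same entry points (a sketch, not part of the commit; it assumes a CUDA GPU, access to the FLUX.1-dev weights, and a hypothetical input file `demo_example.png`) could look like:

```python
# Hypothetical local check of the Gradio app's entry points (not part of this commit).
import app

app.task = "normal"                    # switch the global task before loading weights
app.load_pipeline()                    # builds Lotus2Pipeline with the normal-task LoRA/LCM weights
original, prediction = app.fn("demo_example.png")   # hypothetical input path
prediction.save("demo_normal_vis.png")
```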
infer.py ADDED
@@ -0,0 +1,472 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ """
+ Lotus-2 Inference Script
+
+ Usage:
+     python infer.py --pretrained_model_name_or_path <model_path> [other_args]
+
+ If --core_predictor_model_path, --lcm_model_path, or --detail_sharpener_model_path
+ are not provided, the script will automatically download the corresponding model
+ weights from the default HuggingFace repositories.
+ """
+
+ import argparse
+ import logging
+ import os
+ from contextlib import nullcontext
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ import torch.utils.checkpoint
+ from peft import LoraConfig, set_peft_model_state_dict
+ from PIL import Image
+ from torch import nn
+ from tqdm.auto import tqdm
+
+ try:
+     from huggingface_hub import snapshot_download
+     HF_AVAILABLE = True
+ except ImportError:
+     HF_AVAILABLE = False
+     logging.warning("huggingface_hub not available. Model auto-download will not work.")
+
+ from diffusers import (
+     FlowMatchEulerDiscreteScheduler,
+     FluxTransformer2DModel,
+ )
+ from diffusers.utils import convert_unet_state_dict_to_peft
+ from utils.image_utils import colorize_depth_map
+ from pipeline import Lotus2Pipeline
+ from utils.seed_all import seed_all
+
+ # Default HuggingFace repositories and model filenames
+ DEFAULT_CORE_PREDICTOR_REPO = "jingheya/Lotus-2"
+ DEFAULT_LCM_REPO = "jingheya/Lotus-2"
+ DEFAULT_DETAIL_SHARPENER_REPO = "jingheya/Lotus-2"
+
+ CORE_PREDICTOR_FILENAME = {
+     "depth": "lotus-2_core_predictor_depth.safetensors",
+     "normal": "lotus-2_core_predictor_normal.safetensors"
+ }
+
+ LCM_FILENAME = {
+     "depth": "lotus-2_lcm_depth.safetensors",
+     "normal": "lotus-2_lcm_normal.safetensors"
+ }
+
+ DETAIL_SHARPENER_FILENAME = {
+     "depth": "lotus-2_detail_sharpener_depth.safetensors",
+     "normal": "lotus-2_detail_sharpener_normal.safetensors"
+ }
+
+ def get_model_path(model_path, repo_id, filename):
+     """
+     Get the local path for a model. If model_path is None, download from HuggingFace.
+
+     Args:
+         model_path: Local path to model or None to download from HF
+         repo_id: HuggingFace repository ID
+         filename: Model filename in the repository
+
+     Returns:
+         Local path to the model file
+     """
+     if model_path is not None:
+         return model_path
+
+     if not HF_AVAILABLE:
+         raise ImportError(
+             f"huggingface_hub is required for auto-downloading {filename} model weights. "
+             "Please install it with: pip install huggingface_hub"
+         )
+
+     logging.info(f"Downloading {filename} model weights from {repo_id}/{filename}")
+
+     try:
+         # Create cache directory if it doesn't exist
+         cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
+         os.makedirs(cache_dir, exist_ok=True)
+
+         # Download the entire repository and get the specific file
+         repo_path = snapshot_download(
+             repo_id=repo_id,
+             cache_dir=cache_dir,
+             local_files_only=False,
+         )
+
+         # Construct the full path to the specific file
+         full_path = os.path.join(repo_path, filename)
+
+         if not os.path.exists(full_path):
+             # Try to find the file in the repo
+             for root, dirs, files in os.walk(repo_path):
+                 if filename in files:
+                     full_path = os.path.join(root, filename)
+                     break
+             else:
+                 raise FileNotFoundError(f"Could not find {filename} in the downloaded repository")
+
+         logging.info(f"Successfully downloaded {filename} model to: {full_path}")
+         return full_path
+
+     except Exception as e:
+         raise RuntimeError(f"Failed to download {filename} model from {repo_id}: {str(e)}")
+
+
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+ # check_min_version("0.33.0.dev0")
+
+
+ class Local_Continuity_Module(nn.Module):
+     def __init__(self, num_channels):
+         super().__init__()
+         self.lcm = nn.Sequential(
+             nn.Conv2d(num_channels, num_channels * 2, kernel_size=3, padding=1),
+             nn.GELU(),
+             nn.Conv2d(num_channels * 2, num_channels, kernel_size=3, padding=1),
+         )
+
+     def forward(self, x):
+         lcm_dtype = next(self.lcm.parameters()).dtype
+         if x.dtype != lcm_dtype:
+             x = x.to(dtype=lcm_dtype)
+         return x + self.lcm(x)
+
+ def parse_args(input_args=None):
+     parser = argparse.ArgumentParser(description="Run Lotus-2.")
+     parser.add_argument(
+         "--pretrained_model_name_or_path",
+         type=str,
+         default=None,
+         required=True,
+         help="Path to pretrained model or model identifier from huggingface.co/models.",
+     )
+     parser.add_argument(
+         "--core_predictor_model_path",
+         type=str,
+         default=None,
+         help="Path to core predictor model weights",
+     )
+     parser.add_argument(
+         "--lcm_model_path",
+         type=str,
+         default=None,
+         help="Path to local continuity module model weights",
+     )
+     parser.add_argument(
+         "--detail_sharpener_model_path",
+         type=str,
+         default=None,
+         help="Path to detail sharpener model weights",
+     )
+     parser.add_argument(
+         "--revision",
+         type=str,
+         default=None,
+         required=False,
+         help="Revision of pretrained model identifier from huggingface.co/models.",
+     )
+     parser.add_argument(
+         "--variant",
+         type=str,
+         default=None,
+         help="Variant of the model files of the pretrained model identifier from huggingface.co/models, e.g. 'fp16'.",
+     )
+     parser.add_argument(
+         "--process_res",
+         type=int,
+         default=768,
+         help="The resolution for processing the images.",
+     )
+     parser.add_argument(
+         "--num_inference_steps",
+         type=int,
+         default=10,
+         help="Number of timesteps to infer the model.",
+     )
+     parser.add_argument(
+         "--input_dir",
+         type=str,
+         default=None,
+         help="The directory where the input images are stored.",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="flux-dreambooth-lora",
+         help="The output directory where the model predictions will be written.",
+     )
+     parser.add_argument("--seed", type=int, default=None, help="Random seed.")
+     parser.add_argument(
+         "--task_name",
+         type=str,
+         default="depth",  # "normal"
+     )
+     parser.add_argument(
+         "--mixed_precision",
+         type=str,
+         default=None,
+         choices=["no", "fp16", "bf16"],
+         help=(
+             "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). bf16 requires PyTorch >="
+             " 1.10 and an NVIDIA Ampere GPU. Defaults to the value of the accelerate config of the current system or the"
+             " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+         ),
+     )
+
+     if input_args is not None:
+         args = parser.parse_args(input_args)
+     else:
+         args = parser.parse_args()
+
+     return args
+
+ def process_single_image(image_path, pipeline, task_name, device,
+                          num_inference_steps, process_res=768):
+     image = Image.open(image_path).convert("RGB")
+     image_np = np.array(image).astype(np.float32)
+     image_ts = torch.tensor(image_np).permute(2, 0, 1).unsqueeze(0)
+     image_ts = image_ts / 127.5 - 1.0
+     image_ts = image_ts.to(device)
+
+     prediction = pipeline(
+         rgb_in=image_ts,
+         prompt='',
+         num_inference_steps=num_inference_steps,
+         output_type='np',
+         process_res=process_res,
+     ).images[0]
+
+     if task_name == "depth":
+         output_npy = prediction.mean(axis=-1)
+         output_vis = colorize_depth_map(output_npy, reverse_color=True)
+     elif task_name == "normal":
+         output_npy = prediction
+         output_vis = Image.fromarray((output_npy * 255).astype(np.uint8))
+     else:
+         raise ValueError(f"Invalid task name: {task_name}")
+
+     return image, output_vis, output_npy
+
+ def load_lora_and_lcm_weights(transformer, core_predictor_model_path, lcm_model_path, detail_sharpener_model_path, task_name):
+     lora_rank = 128 if task_name == 'depth' else 256
+     device = transformer.device
+     weight_dtype = transformer.dtype
+
+     target_lora_modules = [
+         "attn.to_k",
+         "attn.to_q",
+         "attn.to_v",
+         "attn.to_out.0",
+         "attn.add_k_proj",
+         "attn.add_q_proj",
+         "attn.add_v_proj",
+         "attn.to_add_out",
+         "ff.net.0.proj",
+         "ff.net.2",
+         "ff_context.net.0.proj",
+         "ff_context.net.2",
+     ]
+
+     # Auto-download models if paths are None
+     core_predictor_model_path = get_model_path(
+         core_predictor_model_path,
+         DEFAULT_CORE_PREDICTOR_REPO,
+         CORE_PREDICTOR_FILENAME[task_name]
+     )
+
+     lcm_model_path = get_model_path(
+         lcm_model_path,
+         DEFAULT_LCM_REPO,
+         LCM_FILENAME[task_name]
+     )
+
+     detail_sharpener_model_path = get_model_path(
+         detail_sharpener_model_path,
+         DEFAULT_DETAIL_SHARPENER_REPO,
+         DETAIL_SHARPENER_FILENAME[task_name]
+     )
+
+     # load lora weights for core predictor
+     core_transformer_lora_config = LoraConfig(
+         r=lora_rank,
+         lora_alpha=lora_rank,
+         init_lora_weights="gaussian",
+         target_modules=target_lora_modules,
+     )
+     transformer.add_adapter(core_transformer_lora_config, adapter_name="core_predictor")
+
+     core_lora_state_dict = Lotus2Pipeline.lora_state_dict(core_predictor_model_path)
+     core_transformer_state_dict = {
+         f'{k.replace("transformer.", "")}': v for k, v in core_lora_state_dict.items() if k.startswith("transformer.")
+     }
+     core_transformer_state_dict = convert_unet_state_dict_to_peft(core_transformer_state_dict)
+     incompatible_keys = set_peft_model_state_dict(transformer, core_transformer_state_dict, adapter_name="core_predictor")
+     if incompatible_keys is not None:
+         # check only for unexpected keys
+         unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+         if unexpected_keys:
+             logging.warning(
+                 f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                 f" {unexpected_keys}. "
+             )
+
+     for name, param in transformer.named_parameters():
+         if "core_predictor" in name:
+             param.requires_grad = False
+     # transformer.to(device=device, dtype=weight_dtype)
+     logging.info(f"Successfully loaded lora weights for [core predictor].")
+
+     # stage1 lcm weights
+     local_continuity_module = Local_Continuity_Module(transformer.config.in_channels//4)
+     lcm_state_dict = torch.load(lcm_model_path, map_location="cpu", weights_only=True)
+     local_continuity_module.load_state_dict(lcm_state_dict)
+     local_continuity_module.requires_grad_(False)
+     local_continuity_module.to(device=device, dtype=weight_dtype)
+     logging.info(f"Successfully loaded weights for [local continuity module (LCM)].")
+
+     # stage2 lora weights (detail sharpener)
+     sharpener_transformer_lora_config = LoraConfig(
+         r=lora_rank,
+         lora_alpha=lora_rank,
+         init_lora_weights="gaussian",
+         target_modules=target_lora_modules,
+     )
+     transformer.add_adapter(sharpener_transformer_lora_config, adapter_name="detail_sharpener")
+
+     sharpener_lora_state_dict = Lotus2Pipeline.lora_state_dict(detail_sharpener_model_path)
+     sharpener_transformer_state_dict = {
+         f'{k.replace("transformer.", "")}': v for k, v in sharpener_lora_state_dict.items() if k.startswith("transformer.")
+     }
+     sharpener_transformer_state_dict = convert_unet_state_dict_to_peft(sharpener_transformer_state_dict)
+     incompatible_keys = set_peft_model_state_dict(transformer, sharpener_transformer_state_dict, adapter_name="detail_sharpener")
+     if incompatible_keys is not None:
+         # check only for unexpected keys
+         unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+         if unexpected_keys:
+             logging.warning(
+                 f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                 f" {unexpected_keys}. "
+             )
+
+     # freeze the stage2 lora
+     for name, param in transformer.named_parameters():
+         if "detail_sharpener" in name:
+             param.requires_grad = False
+     # transformer.to(device=device, dtype=weight_dtype)
+     logging.info(f"Successfully loaded lora weights for [detail sharpener].")
+
+     return transformer, local_continuity_module
+
+ def main(args):
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         level=logging.INFO,
+     )
+     logging.info("Run Lotus-2! ")
+
+     # -------------------- Preparation --------------------
+     # Check if model paths are provided, if not, they will be auto-downloaded from HuggingFace
+     if args.core_predictor_model_path is None or args.lcm_model_path is None or args.detail_sharpener_model_path is None:
+         if HF_AVAILABLE:
+             logging.info("Some model paths are not provided. Model weights will be automatically downloaded from HuggingFace.")
+             logging.info(f"Core predictor repo: {DEFAULT_CORE_PREDICTOR_REPO}")
+             logging.info(f"LCM repo: {DEFAULT_LCM_REPO}")
+             logging.info(f"Detail sharpener repo: {DEFAULT_DETAIL_SHARPENER_REPO}")
+         else:
+             logging.warning("Some model paths are not provided and huggingface_hub is not available.")
+             logging.warning("Please install huggingface_hub: pip install huggingface_hub")
+             logging.warning("Or provide local paths for all model weights.")
+
+     # Random seed
+     if args.seed is not None:
+         seed_all(args.seed)
+
+     # Output directories
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     output_dir_vis = os.path.join(args.output_dir, f'{args.task_name}_vis')
+     output_dir_npy = os.path.join(args.output_dir, f'{args.task_name}_npy')
+     if not os.path.exists(output_dir_vis): os.makedirs(output_dir_vis)
+     if not os.path.exists(output_dir_npy): os.makedirs(output_dir_npy)
+
+     logging.info(f"Output dir = {args.output_dir}")
+
+     # Mixed precision
+     if args.mixed_precision == "fp16":
+         weight_dtype = torch.float16
+     elif args.mixed_precision == "bf16":
+         weight_dtype = torch.bfloat16
+     else:
+         weight_dtype = torch.float32
+     logging.info(f"Running with {weight_dtype} precision.")
+
+     # Device
+     if torch.cuda.is_available():
+         device = torch.device("cuda")
+     else:
+         device = torch.device("cpu")
+         logging.warning("CUDA is not available. Running on CPU will be slow.")
+     logging.info(f"Device = {device}")
+
+     # -------------------- Data --------------------
+     input_dir = Path(args.input_dir)
+     test_images = list(input_dir.rglob('*.png')) + list(input_dir.rglob('*.jpg'))
+     test_images = sorted(test_images)
+     logging.info(f'==> There are {len(test_images)} images for validation.')
+
+     # -------------------- Load scheduler and models --------------------
+     # scheduler
+     noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+         args.pretrained_model_name_or_path, subfolder="scheduler", num_train_timesteps=10
+     )
+     # transformer
+     transformer = FluxTransformer2DModel.from_pretrained(
+         args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant
+     )
+     transformer.requires_grad_(False)
+     transformer.to(device=device, dtype=weight_dtype)
+
+     # load weights
+     transformer, local_continuity_module = load_lora_and_lcm_weights(transformer,
+         args.core_predictor_model_path,
+         args.lcm_model_path,
+         args.detail_sharpener_model_path,
+         args.task_name
+     )
+
+     # -------------------- Pipeline --------------------
+     pipeline = Lotus2Pipeline.from_pretrained(
+         args.pretrained_model_name_or_path,
+         scheduler=noise_scheduler,
+         transformer=transformer,
+         revision=args.revision,
+         variant=args.variant,
+         torch_dtype=weight_dtype,
+     )
+     pipeline.local_continuity_module = local_continuity_module
+     pipeline = pipeline.to(device)
+
+     # -------------------- Run inference! --------------------
+     pipeline.set_progress_bar_config(disable=True)
+
+     with nullcontext():
+         for image_path in tqdm(test_images):
+             # print("\n", image_path)
+             _, output_vis, output_npy = process_single_image(
+                 image_path, pipeline,
+                 task_name=args.task_name,
+                 device=device,
+                 num_inference_steps=args.num_inference_steps,
+                 process_res=args.process_res
+             )
+
+             output_vis.save(os.path.join(output_dir_vis, f'{image_path.stem}.png'))
+             np.save(os.path.join(output_dir_npy, f'{image_path.stem}.npy'), output_npy)
+
+ if __name__ == "__main__":
+     args = parse_args()
+     main(args)
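
`process_single_image` hands the pipeline images normalized to [-1, 1] via `x / 127.5 - 1.0` before VAE encoding. A quick standalone check of that mapping (independent of the model weights):

```python
# The normalization used in process_single_image maps 8-bit pixel values
# 0 -> -1.0, 127.5 -> 0.0 and 255 -> 1.0.
import numpy as np
import torch

pixels = torch.tensor(np.array([0.0, 127.5, 255.0], dtype=np.float32))
print(pixels / 127.5 - 1.0)  # tensor([-1.,  0.,  1.])
```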
infer.sh ADDED
@@ -0,0 +1,30 @@
+ export OPENCV_IO_ENABLE_OPENEXR=1
+ export TOKENIZERS_PARALLELISM=false
+
+ export TASK_NAME="normal"
+
+ # paths
+ export MODEL_NAME="black-forest-labs/FLUX.1-dev"
+ # export CORE_PREDICTOR_MODEL_PATH="weights/lotus-2_core_predictor_$TASK_NAME.safetensors"
+ # export DETAIL_SHARPENER_MODEL_PATH="weights/lotus-2_detail_sharpener_$TASK_NAME.safetensors"
+ # export LCM_MODEL_PATH="weights/lotus-2_lcm_$TASK_NAME.safetensors"
+
+ export INPUT_DIR="assets"
+ export OUTPUT_DIR="outputs/infer/"
+
+ # configs
+ export NUM_INFERENCE_STEPS=10
+
+
+ CUDA_VISIBLE_DEVICES=0 python infer.py \
+     --pretrained_model_name_or_path=$MODEL_NAME \
+     --input_dir=$INPUT_DIR \
+     --output_dir=$OUTPUT_DIR \
+     --mixed_precision="bf16" \
+     --num_inference_steps=$NUM_INFERENCE_STEPS \
+     --seed="0" \
+     --task_name=$TASK_NAME \
+     --process_res=1024
+     # --core_predictor_model_path=$CORE_PREDICTOR_MODEL_PATH \
+     # --detail_sharpener_model_path=$DETAIL_SHARPENER_MODEL_PATH \
+     # --lcm_model_path=$LCM_MODEL_PATH \
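
Because `parse_args` accepts an explicit argument list, the same run can also be launched from Python. A minimal sketch equivalent to the script above (same paths and flags):

```python
# Programmatic equivalent of infer.sh (a sketch; uses the same paths as the script above).
from infer import main, parse_args

args = parse_args([
    "--pretrained_model_name_or_path", "black-forest-labs/FLUX.1-dev",
    "--input_dir", "assets",
    "--output_dir", "outputs/infer/",
    "--mixed_precision", "bf16",
    "--num_inference_steps", "10",
    "--seed", "0",
    "--task_name", "normal",
    "--process_res", "1024",
])
main(args)
```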
pipeline.py ADDED
@@ -0,0 +1,214 @@
+ from typing import Union, Optional, List, Dict, Any
+ import numpy as np
+ import torch
+ from diffusers import FluxPipeline
+ from diffusers.pipelines.flux import FluxPipelineOutput
+ from diffusers.pipelines.flux.pipeline_flux import calculate_shift, retrieve_timesteps
+ from diffusers.utils import is_torch_xla_available
+
+ from utils.image_utils import resize_image, resize_image_first
+
+ if is_torch_xla_available():
+     import torch_xla.core.xla_model as xm
+     XLA_AVAILABLE = True
+ else:
+     XLA_AVAILABLE = False
+
+ class Lotus2Pipeline(FluxPipeline):
+     @torch.no_grad()
+     def __call__(
+         self,
+         rgb_in: Optional[torch.FloatTensor] = None,
+         prompt: Union[str, List[str]] = None,
+         num_inference_steps: int = 10,
+         output_type: Optional[str] = "pil",
+         process_res: Optional[int] = None,
+         timestep_core_predictor: int = 1,
+         guidance_scale: float = 3.5,
+         return_dict: bool = True,
+         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+     ):
+         r"""
+         Function invoked when calling the pipeline for generation.
+
+         Args:
+             rgb_in (`torch.FloatTensor`, *optional*):
+                 The input image to be used for generation.
+             prompt (`str` or `List[str]`, *optional*):
+                 The prompt or prompts to guide the prediction. Default is ''.
+             num_inference_steps (`int`, *optional*, defaults to 10):
+                 The number of denoising steps. More denoising steps usually lead to a sharper prediction at the
+                 expense of slower inference.
+             guidance_scale (`float`, *optional*, defaults to 3.5):
+                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+             output_type (`str`, *optional*, defaults to `"pil"`):
+                 The output format of the generated image. Choose between
+                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+             joint_attention_kwargs (`dict`, *optional*):
+                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                 `self.processor` in
+                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+         Examples:
+
+         Returns:
+             [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+             is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+             images.
+         """
+         # 1. prepare
+         batch_size = rgb_in.shape[0]
+         input_size = rgb_in.shape[2:]
+         rgb_in = resize_image_first(rgb_in, process_res)
+         height, width = rgb_in.shape[2:]
+
+         self._guidance_scale = guidance_scale
+         self._joint_attention_kwargs = joint_attention_kwargs
+         self._interrupt = False
+
+         device = self._execution_device
+
+         # 2. encode prompt
+         (
+             prompt_embeds,
+             pooled_prompt_embeds,
+             text_ids,
+         ) = self.encode_prompt(
+             prompt=prompt,
+             prompt_2=None,
+             device=device,
+         )
+
+         # 3. prepare latent variables
+         rgb_in = rgb_in.to(device=device, dtype=self.dtype)
+         rgb_latents = self.vae.encode(rgb_in).latent_dist.sample()
+         rgb_latents = (rgb_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+         packed_rgb_latents = self._pack_latents(
+             rgb_latents,
+             batch_size=rgb_latents.shape[0],
+             num_channels_latents=rgb_latents.shape[1],
+             height=rgb_latents.shape[2],
+             width=rgb_latents.shape[3],
+         )
+
+         latent_image_ids_core_predictor = self._prepare_latent_image_ids(batch_size, rgb_latents.shape[2]//2, rgb_latents.shape[3]//2, device, rgb_latents.dtype)
+         latent_image_ids = self._prepare_latent_image_ids(batch_size, rgb_latents.shape[2]//2, rgb_latents.shape[3]//2, device, rgb_latents.dtype)
+
+         # 4. prepare timesteps
+         timestep_core_predictor = torch.tensor(timestep_core_predictor).expand(batch_size).to(device=rgb_in.device, dtype=rgb_in.dtype)
+
+         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+         image_seq_len = packed_rgb_latents.shape[1]
+         mu = calculate_shift(
+             image_seq_len,
+             self.scheduler.config.base_image_seq_len,
+             self.scheduler.config.max_image_seq_len,
+             self.scheduler.config.base_shift,
+             self.scheduler.config.max_shift,
+         )
+         timesteps, num_inference_steps = retrieve_timesteps(
+             self.scheduler,
+             num_inference_steps,
+             device,
+             sigmas=sigmas,
+             mu=mu,
+         )
+         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)  # 0
+         self._num_timesteps = len(timesteps)
+
+         # 5. handle guidance
+         if self.transformer.config.guidance_embeds:
+             guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+             guidance = guidance.expand(packed_rgb_latents.shape[0])
+         else:
+             guidance = None
+
+         if self.joint_attention_kwargs is None:
+             self._joint_attention_kwargs = {}
+
+         # 6. core predictor
+         self.transformer.set_adapter("core_predictor")
+         latents = self.transformer(
+             hidden_states=packed_rgb_latents,
+             timestep=timestep_core_predictor / 1000,
+             guidance=guidance,
+             pooled_projections=pooled_prompt_embeds,
+             encoder_hidden_states=prompt_embeds,
+             txt_ids=text_ids,
+             img_ids=latent_image_ids_core_predictor,
+             joint_attention_kwargs=self.joint_attention_kwargs,  # {}
+             return_dict=False,
+         )[0]
+         latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+         latents = self.local_continuity_module(latents)
+
+         # 7. Denoising loop for detail sharpener
+         self.transformer.set_adapter("detail_sharpener")
+         latents = self._pack_latents(
+             latents,
+             batch_size=latents.shape[0],
+             num_channels_latents=latents.shape[1],
+             height=latents.shape[2],
+             width=latents.shape[3],
+         )
+
+         with self.progress_bar(total=num_inference_steps) as progress_bar:
+             for i, t in enumerate(timesteps):
+                 if self.interrupt:
+                     continue
+
+                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+                 noise_pred = self.transformer(
+                     hidden_states=latents,
+                     timestep=timestep / 1000,
+                     guidance=guidance,
+                     pooled_projections=pooled_prompt_embeds,
+                     encoder_hidden_states=prompt_embeds,
+                     txt_ids=text_ids,
+                     img_ids=latent_image_ids,
+                     joint_attention_kwargs=self.joint_attention_kwargs,
+                     return_dict=False,
+                 )[0]
+
+                 # compute the previous noisy sample x_t -> x_t-1
+                 latents_dtype = latents.dtype
+                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                 if latents.dtype != latents_dtype:
+                     if torch.backends.mps.is_available():
+                         # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                         latents = latents.to(latents_dtype)
+
+                 # call the callback, if provided
+                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                     progress_bar.update()
+
+                 if XLA_AVAILABLE:
+                     xm.mark_step()
+
+         latents = latents.to(dtype=self.dtype)
+
+         if output_type == "latent":
+             image = latents
+
+         else:
+             latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+             latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+             image = self.vae.decode(latents, return_dict=False)[0]
+             image = self.image_processor.postprocess(image, output_type=output_type)
+
+         # Resize output image to match input size
+         image = resize_image(image, input_size)
+
+         # Offload all models
+         self.maybe_free_model_hooks()
+
+         if not return_dict:
+             return (image,)
+
+         return FluxPipelineOutput(images=image)
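
The `_pack_latents` / `_unpack_latents` calls inherited from `FluxPipeline` turn the VAE latents into a token sequence by grouping 2×2 spatial patches, which is why both the core predictor and the detail sharpener operate on sequences of length (H/2)·(W/2) with 4·C channels per token. The following is an illustrative re-implementation of just that shape bookkeeping (a sketch for intuition, not the diffusers code itself):

```python
# Shape bookkeeping behind the pack/unpack steps (illustration only):
# (B, C, H, W) latents -> (B, (H/2)*(W/2), 4*C) sequence of 2x2 patches.
import torch

def pack_latents_like_flux(latents: torch.Tensor) -> torch.Tensor:
    b, c, h, w = latents.shape
    x = latents.view(b, c, h // 2, 2, w // 2, 2)     # split H and W into 2x2 patches
    x = x.permute(0, 2, 4, 1, 3, 5)                  # (B, H/2, W/2, C, 2, 2)
    return x.reshape(b, (h // 2) * (w // 2), c * 4)  # one token per patch

latents = torch.randn(1, 16, 64, 64)                 # e.g. a 512x512 input after the FLUX VAE
print(pack_latents_like_flux(latents).shape)         # torch.Size([1, 1024, 64])
```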
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ numpy==1.26.4
+ matplotlib==3.10.0
+ peft==0.14.0
+ protobuf==5.29.0
+ sentencepiece==0.2.0
+ opencv-python==4.11.0.86
+ huggingface-hub==0.36.0
+ diffusers==0.32.2
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+ torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
+ gradio==5.49.0
+ gradio-client==1.13.3
+ gradio-imageslider==0.0.20
+ spaces==0.42.1
utils/image_utils.py ADDED
@@ -0,0 +1,514 @@
+ from PIL import Image
+ import matplotlib
+ import numpy as np
+ from typing import List
+ import csv
+ import cv2
+
+ import torch
+ from torchvision.transforms import InterpolationMode
+ from torchvision.transforms.functional import resize
+
+ def numpy_to_pil(images: np.ndarray) -> List[Image.Image]:
+     r"""
+     Convert a numpy image or a batch of images to a PIL image.
+
+     Args:
+         images (`np.ndarray`):
+             The image array to convert to PIL format.
+
+     Returns:
+         `List[PIL.Image.Image]`:
+             A list of PIL images.
+     """
+     if images.ndim == 3:
+         images = images[None, ...]
+     images = (images * 255).round().astype("uint8")
+     if images.shape[-1] == 1:
+         # special case for grayscale (single channel) images
+         pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+     else:
+         pil_images = [Image.fromarray(image) for image in images]
+
+     return pil_images
+
+ def resize_output(image, target_size):
+     """
+     Resize output image to target size
+     Args:
+         image: Image in PIL.Image, numpy.array or torch.tensor format
+         target_size: tuple, target size (H, W)
+     Returns:
+         Resized image in original format
+     """
+     if isinstance(image, list):
+         return [resize_output(img, target_size) for img in image]
+
+     if isinstance(image, Image.Image):
+         return image.resize(target_size[::-1], Image.BILINEAR)
+     elif isinstance(image, np.ndarray):
+         # Handle numpy array with shape (1, H, W, 3)
+         if image.ndim == 4:
+             resized = np.stack([cv2.resize(img, target_size[::-1]) for img in image])
+             return resized
+         else:
+             return cv2.resize(image, target_size[::-1])
+     elif isinstance(image, torch.Tensor):
+         # Handle tensor with shape (1, 3, H, W)
+         if image.dim() == 4:
+             return torch.nn.functional.interpolate(
+                 image,
+                 size=target_size,
+                 mode='bilinear',
+                 align_corners=False
+             )
+         else:
+             return torch.nn.functional.interpolate(
+                 image.unsqueeze(0),
+                 size=target_size,
+                 mode='bilinear',
+                 align_corners=False
+             ).squeeze(0)
+     else:
+         raise ValueError(f"Unsupported image format: {type(image)}")
+
+ def resize_image(image, target_size):
+     """
+     Resize image to target size
+     Args:
+         image: Image in PIL.Image, numpy.array or torch.tensor format
+         target_size: tuple, target size (H, W)
+     Returns:
+         Resized image in original format
+     """
+     if isinstance(image, list):
+         return [resize_image(img, target_size) for img in image]
+
+     if isinstance(image, Image.Image):
+         return image.resize(target_size[::-1], Image.BILINEAR)
+     elif isinstance(image, np.ndarray):
+         # Handle numpy array with shape (1, H, W, 3)
+         if image.ndim == 4:
+             resized = np.stack([cv2.resize(img, target_size[::-1]) for img in image])
+             return resized
+         else:
+             return cv2.resize(image, target_size[::-1])
+     elif isinstance(image, torch.Tensor):
+         # Handle tensor with shape (1, 3, H, W)
+         if image.dim() == 4:
+             return torch.nn.functional.interpolate(
+                 image,
+                 size=target_size,
+                 mode='bilinear',
+                 align_corners=False
+             )
+         else:
+             return torch.nn.functional.interpolate(
+                 image.unsqueeze(0),
+                 size=target_size,
+                 mode='bilinear',
+                 align_corners=False
+             ).squeeze(0)
+     else:
+         raise ValueError(f"Unsupported image format: {type(image)}")
+
+ def resize_image_first(image_tensor, process_res=None):
+     if process_res:
+         max_edge = max(image_tensor.shape[2], image_tensor.shape[3])
+         if max_edge > process_res:
+             scale = process_res / max_edge
+             new_height = int(image_tensor.shape[2] * scale)
+             new_width = int(image_tensor.shape[3] * scale)
+             image_tensor = resize_image(image_tensor, (new_height, new_width))
+
+     image_tensor = resize_to_multiple_of_16(image_tensor)
+
+     return image_tensor
+
+
+ def smooth_image(image, method='gaussian', kernel_size=31, sigma=15.0, bilateral_d=9, bilateral_color=75, bilateral_space=75):
+     """
+     Apply one of several smoothing methods to remove grid artifacts from an image.
+
+     Args:
+         image: Image in PIL.Image, numpy.array or torch.tensor format
+         method: Smoothing method, one of 'gaussian' (Gaussian blur), 'bilateral' (bilateral filter),
+                 'median' (median filter), 'guided' (guided filter), or 'strong' (aggressive smoothing
+                 that combines several filters)
+         kernel_size: Kernel size for Gaussian and median filtering, default 31, must be odd
+         sigma: Standard deviation for Gaussian filtering, default 15.0
+         bilateral_d: Diameter for bilateral filtering, default 9
+         bilateral_color: Color-space sigma for bilateral filtering, default 75
+         bilateral_space: Coordinate-space sigma for bilateral filtering, default 75
+
+     Returns:
+         Smoothed image in the original format
+     """
+     if isinstance(image, list):
+         return [smooth_image(img, method, kernel_size, sigma, bilateral_d, bilateral_color, bilateral_space) for img in image]
+
+     # Make sure kernel_size is odd
+     if kernel_size % 2 == 0:
+         kernel_size += 1
+
+     # Convert to a numpy array for processing
+     is_pil = isinstance(image, Image.Image)
+     is_tensor = isinstance(image, torch.Tensor)
+
+     if is_pil:
+         img_array = np.array(image)
+     elif is_tensor:
+         device = image.device
+         if image.dim() == 4:  # (B, C, H, W)
+             batch_size, channels, height, width = image.shape
+             img_array = image.permute(0, 2, 3, 1).cpu().numpy()  # (B, H, W, C)
+         else:  # (C, H, W)
+             img_array = image.permute(1, 2, 0).cpu().numpy()  # (H, W, C)
+     else:
+         img_array = image
+
+     # Remember the original dtype
+     original_dtype = img_array.dtype
+
+     # Apply the selected smoothing method
+     if method == 'gaussian':
+         # Standard Gaussian blur, suitable for mild smoothing
+         if img_array.ndim == 4:
+             smoothed = np.stack([cv2.GaussianBlur(img, (kernel_size, kernel_size), sigma) for img in img_array])
+         else:
+             smoothed = cv2.GaussianBlur(img_array, (kernel_size, kernel_size), sigma)
+
+     elif method == 'bilateral':
+         # Bilateral filtering smooths flat regions while preserving edges
+         if img_array.ndim == 4:
+             # Make sure the images are 8-bit
+             imgs_uint8 = [img.astype(np.uint8) if img.dtype != np.uint8 else img for img in img_array]
+             smoothed = np.stack([cv2.bilateralFilter(img, bilateral_d, bilateral_color, bilateral_space) for img in imgs_uint8])
+             # Convert back to the original dtype
+             if original_dtype != np.uint8:
+                 smoothed = smoothed.astype(original_dtype)
+         else:
+             # Make sure the image is 8-bit
+             img_uint8 = img_array.astype(np.uint8) if img_array.dtype != np.uint8 else img_array
+             smoothed = cv2.bilateralFilter(img_uint8, bilateral_d, bilateral_color, bilateral_space)
+             # Convert back to the original dtype
+             if original_dtype != np.uint8:
+                 smoothed = smoothed.astype(original_dtype)
+
+     elif method == 'median':
+         # Median filtering is very effective against salt-and-pepper noise and small grid cells
+         # Median filtering requires uint8 or uint16 input
+         if img_array.ndim == 4:
+             # Convert to 8-bit unsigned integers and make sure the format is correct
+             imgs_uint8 = []
+             for img in img_array:
+                 # Scale floating-point images to the 0-255 range
+                 if img.dtype != np.uint8:
+                     if img.max() <= 1.0:  # check whether the values are floats in [0, 1]
+                         img = (img * 255).astype(np.uint8)
+                     else:
+                         img = img.astype(np.uint8)
+                 imgs_uint8.append(img)
+
+             smoothed = np.stack([cv2.medianBlur(img, kernel_size) for img in imgs_uint8])
+             # Convert back to the original dtype
+             if original_dtype != np.uint8:
+                 if original_dtype == np.float32 or original_dtype == np.float64:
+                     if img_array.max() <= 1.0:  # check whether the original data was in [0, 1]
+                         smoothed = smoothed.astype(float) / 255.0
+
+         else:
+             # Convert to 8-bit unsigned integers
+             if img_array.dtype != np.uint8:
+                 if img_array.max() <= 1.0:  # check whether the values are floats in [0, 1]
+                     img_uint8 = (img_array * 255).astype(np.uint8)
+                 else:
+                     img_uint8 = img_array.astype(np.uint8)
+             else:
+                 img_uint8 = img_array
+
+             smoothed = cv2.medianBlur(img_uint8, kernel_size)
+             # Convert back to the original dtype
+             if original_dtype != np.uint8:
+                 if original_dtype == np.float32 or original_dtype == np.float64:
+                     if img_array.max() <= 1.0:  # check whether the original data was in [0, 1]
+                         smoothed = smoothed.astype(float) / 255.0
+                 else:
+                     smoothed = smoothed.astype(original_dtype)
+
+     elif method == 'guided':
+         # Guided filtering smooths regions while preserving edges
+         if img_array.ndim == 4:
+             smoothed = np.stack([cv2.ximgproc.guidedFilter(
+                 guide=img, src=img, radius=kernel_size//2, eps=1e-6) for img in img_array])
+         else:
+             smoothed = cv2.ximgproc.guidedFilter(
+                 guide=img_array, src=img_array, radius=kernel_size//2, eps=1e-6)
+
+     elif method == 'strong':
+         # Aggressive smoothing: a median filter to remove sharp noise, then a bilateral filter to
+         # preserve edges, and finally a Gaussian blur for further smoothing
+         if img_array.ndim == 4:
+             # Convert to 8-bit unsigned integers
+             imgs_uint8 = []
+             for img in img_array:
+                 # Scale floating-point images to the 0-255 range
+                 if img.dtype != np.uint8:
+                     if img.max() <= 1.0:  # check whether the values are floats in [0, 1]
+                         img = (img * 255).astype(np.uint8)
+                     else:
+                         img = img.astype(np.uint8)
+                 imgs_uint8.append(img)
+
+             temp = np.stack([cv2.medianBlur(img, min(15, kernel_size)) for img in imgs_uint8])
+             temp = np.stack([cv2.bilateralFilter(img, bilateral_d, bilateral_color, bilateral_space) for img in temp])
+             smoothed = np.stack([cv2.GaussianBlur(img, (kernel_size, kernel_size), sigma) for img in temp])
+
+             # Convert back to the original dtype
+             if original_dtype != np.uint8:
+                 if original_dtype == np.float32 or original_dtype == np.float64:
+                     if img_array.max() <= 1.0:  # check whether the original data was in [0, 1]
+                         smoothed = smoothed.astype(float) / 255.0
+                 else:
+                     smoothed = smoothed.astype(original_dtype)
+         else:
+             # Convert to 8-bit unsigned integers
+             if img_array.dtype != np.uint8:
+                 if img_array.max() <= 1.0:  # check whether the values are floats in [0, 1]
+                     img_uint8 = (img_array * 255).astype(np.uint8)
+                 else:
+                     img_uint8 = img_array.astype(np.uint8)
+             else:
+                 img_uint8 = img_array
+
+             temp = cv2.medianBlur(img_uint8, min(15, kernel_size))
+             temp = cv2.bilateralFilter(temp, bilateral_d, bilateral_color, bilateral_space)
+             smoothed = cv2.GaussianBlur(temp, (kernel_size, kernel_size), sigma)
+
+             # Convert back to the original dtype
+             if original_dtype != np.uint8:
+                 if original_dtype == np.float32 or original_dtype == np.float64:
+                     if img_array.max() <= 1.0:  # check whether the original data was in [0, 1]
+                         smoothed = smoothed.astype(float) / 255.0
+                 else:
+                     smoothed = smoothed.astype(original_dtype)
+
+     else:
+         raise ValueError(f"Unsupported smoothing method: {method}, please choose 'gaussian', 'bilateral', 'median', 'guided' or 'strong'")
+
+     # Convert the result back to the original format
+     if is_pil:
+         # If the result is floating point with values in [0, 1], convert to uint8 in [0, 255] first
+         if smoothed.dtype == np.float32 or smoothed.dtype == np.float64:
+             if smoothed.max() <= 1.0:
+                 smoothed = (smoothed * 255).astype(np.uint8)
+         return Image.fromarray(smoothed.astype(np.uint8))
+     elif is_tensor:
+         if image.dim() == 4:
+             return torch.from_numpy(smoothed).permute(0, 3, 1, 2).to(device)
+         else:
+             return torch.from_numpy(smoothed).permute(2, 0, 1).to(device)
+     else:
+         return smoothed
+
+ def resize_to_multiple_of_16(image_tensor):
+     """
+     Resize image tensor to make shorter side closest multiple of 16 while maintaining aspect ratio
+     Args:
+         image_tensor: Input tensor of shape (B, C, H, W)
+     Returns:
+         Resized tensor where shorter side is multiple of 16
+     """
+     # Calculate scale ratio based on shorter side to make it closest multiple of 16
+     h, w = image_tensor.shape[2], image_tensor.shape[3]
+     min_side = min(h, w)
+     scale = (min_side // 16) * 16 / min_side
+
+     # Calculate new height and width
+     new_h = int(h * scale)
+     new_w = int(w * scale)
+
+     # Ensure both height and width are multiples of 16
+     new_h = (new_h // 16) * 16
+     new_w = (new_w // 16) * 16
+
+     # Resize image while maintaining aspect ratio
+     resized_tensor = torch.nn.functional.interpolate(
+         image_tensor,
+         size=(new_h, new_w),
+         mode='bilinear',
+         align_corners=False
+     )
+     return resized_tensor
+
+ def load_color_list(csv_path):
+     color_list = []
+     with open(csv_path, newline='') as file:
+         reader = csv.reader(file)
+
+         next(reader)
+
+         for row in reader:
+             last_three = tuple(map(int, row[-3:]))
+             color_list.append(last_three)
+
+     color_list = [(0, 0, 0)] + color_list
+
+     return color_list
+
+ def conver_rgb_to_semantic_map(image: Image, color_list: List):
+     # Convert PIL Image to numpy array
+     image_array = np.array(image)
+
+     # Initialize an empty array for the indexed image
+     indexed_image = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=int)
+
+     # Loop through each pixel in the image
+     for i in range(image_array.shape[0]):
+         for j in range(image_array.shape[1]):
+             # Get the color of the current pixel
+             pixel_color = tuple(image_array[i, j][:3])  # Exclude the alpha channel if present
+
+             # Find the closest color from the color list and get its index
+             # Here, the Euclidean distance is used to find the closest color
+             distances = np.sqrt(np.sum((np.array(color_list) - np.array(pixel_color))**2, axis=1))
+             closest_color_index = np.argmin(distances)
+
+             # Set the index in the indexed image
+             indexed_image[i, j] = closest_color_index
+
+     indexed_image = indexed_image - 1
+
+     return indexed_image
+
+
+ def concatenate_images(*image_lists):
+     # Ensure at least one image list is provided
+     if not image_lists or not image_lists[0]:
+         raise ValueError("At least one non-empty image list must be provided")
+
+     # Determine the maximum width of any single row and the total height
+     max_width = 0
+     total_height = 0
+     row_widths = []
+     row_heights = []
+
+     # Compute dimensions for each row
+     for image_list in image_lists:
+         if image_list:  # Ensure the list is not empty
+             width = sum(img.width for img in image_list)
+             height = max(img.height for img in image_list)
+             max_width = max(max_width, width)
+             total_height += height
+             row_widths.append(width)
+             row_heights.append(height)
+
+     # Create a new image to concatenate everything into
+     new_image = Image.new('RGB', (max_width, total_height))
+
+     # Concatenate each row of images
+     y_offset = 0
+     for i, image_list in enumerate(image_lists):
+         x_offset = 0
+         for img in image_list:
+             new_image.paste(img, (x_offset, y_offset))
+             x_offset += img.width
+         y_offset += row_heights[i]  # Move the offset down to the next row
+
+     return new_image
+
+
+ # def concatenate_images(image_list1, image_list2):
+ #     # Ensure both image lists are not empty
+ #     if not image_list1 or not image_list2:
+ #         raise ValueError("Image lists cannot be empty")
+
+ #     # Get the width and height of the first image
+ #     width, height = image_list1[0].size
+
+ #     # Calculate the total width and height
+ #     total_width = max(len(image_list1), len(image_list2)) * width
+ #     total_height = 2 * height  # For two rows
+
+ #     # Create a new image to concatenate everything into
+ #     new_image = Image.new('RGB', (total_width, total_height))
+
+ #     # Concatenate the first row of images
+ #     x_offset = 0
+ #     for img in image_list1:
+ #         new_image.paste(img, (x_offset, 0))
+ #         x_offset += img.width
+
+ #     # Concatenate the second row of images
+ #     x_offset = 0
+ #     for img in image_list2:
+ #         new_image.paste(img, (x_offset, height))
+ #         x_offset += img.width
+
+ #     return new_image
+
+ def colorize_depth_map(depth, mask=None, reverse_color=False):
+     cm = matplotlib.colormaps["Spectral"]
+     # normalize
+     depth = ((depth - depth.min()) / (depth.max() - depth.min()))
+     # colorize
+     if reverse_color:
+         img_colored_np = cm(1 - depth, bytes=False)[:, :, 0:3]  # Invert the depth values before applying colormap
+     else:
+         img_colored_np = cm(depth, bytes=False)[:, :, 0:3]  # (h,w,3)
+
+     depth_colored = (img_colored_np * 255).astype(np.uint8)
+     if mask is not None:
+         masked_image = np.zeros_like(depth_colored)
+         masked_image[mask.numpy()] = depth_colored[mask.numpy()]
+         depth_colored_img = Image.fromarray(masked_image)
+     else:
+         depth_colored_img = Image.fromarray(depth_colored)
+     return depth_colored_img
+
+
+ def resize_max_res(
+     img: torch.Tensor,
+     max_edge_resolution: int,
+     resample_method: InterpolationMode = InterpolationMode.BILINEAR,
+ ) -> torch.Tensor:
+     """
+     Resize image to limit maximum edge length while keeping aspect ratio.
+
+     Args:
+         img (`torch.Tensor`):
+             Image tensor to be resized. Expected shape: [B, C, H, W]
+         max_edge_resolution (`int`):
+             Maximum edge length (pixel).
+         resample_method (`torchvision.transforms.InterpolationMode`):
+             Resampling method used to resize images.
+
+     Returns:
+         `torch.Tensor`: Resized image.
+     """
+     assert 4 == img.dim(), f"Invalid input shape {img.shape}"
+
+     original_height, original_width = img.shape[-2:]
+     downscale_factor = min(
+         max_edge_resolution / original_width, max_edge_resolution / original_height
+     )
+
+     new_width = int(original_width * downscale_factor)
+     new_height = int(original_height * downscale_factor)
+
+     resized_img = resize(img, (new_height, new_width), resample_method, antialias=True)
+     return resized_img
+
+
+ def get_tv_resample_method(method_str: str) -> InterpolationMode:
+     resample_method_dict = {
+         "bilinear": InterpolationMode.BILINEAR,
+         "bicubic": InterpolationMode.BICUBIC,
+         "nearest": InterpolationMode.NEAREST_EXACT,
+         "nearest-exact": InterpolationMode.NEAREST_EXACT,
+     }
+     resample_method = resample_method_dict.get(method_str, None)
+     if resample_method is None:
+         raise ValueError(f"Unknown resampling method: {method_str}")
+     else:
+         return resample_method
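
`resize_image_first` caps the longer edge at `process_res` and then calls `resize_to_multiple_of_16`, so whatever reaches the VAE has both sides divisible by 16 (needed for the 8× VAE downsampling followed by 2×2 latent packing). A quick check of that invariant (a standalone sketch, run from the repository root):

```python
# Both spatial sides of the resized tensor end up divisible by 16.
import torch
from utils.image_utils import resize_to_multiple_of_16

x = torch.zeros(1, 3, 750, 1000)
h, w = resize_to_multiple_of_16(x).shape[-2:]
print(h % 16, w % 16)  # 0 0
```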
utils/seed_all.py ADDED
@@ -0,0 +1,33 @@
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # --------------------------------------------------------------------------
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
+ # More information about the method can be found at https://marigoldmonodepth.github.io
+ # --------------------------------------------------------------------------
+
+
+ import numpy as np
+ import random
+ import torch
+
+
+ def seed_all(seed: int = 0):
+     """
+     Set random seeds of all components.
+     """
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)