medmekk (HF Staff) committed on
Commit 12075d1 · verified · 1 Parent(s): e47b92d

Upload folder using huggingface_hub

Files changed (40):
  1. .gitattributes +6 -0
  2. README.md +10 -0
  3. build.toml +25 -0
  4. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__init__.py +12 -0
  5. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  6. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  7. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_ops.py +9 -0
  8. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  9. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__init__.py +12 -0
  10. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  11. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  12. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_ops.py +9 -0
  13. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  14. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__init__.py +12 -0
  15. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  16. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  17. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_ops.py +9 -0
  18. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  19. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__init__.py +12 -0
  20. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  21. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  22. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_ops.py +9 -0
  23. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  24. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__init__.py +12 -0
  25. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  26. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  27. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_ops.py +9 -0
  28. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  29. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__init__.py +12 -0
  30. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  31. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  32. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_ops.py +9 -0
  33. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  34. flake.lock +168 -0
  35. flake.nix +13 -0
  36. sam3_kernels/connected_components.cu +305 -0
  37. sam3_kernels/generic_nms.cu +331 -0
  38. torch-ext/sam3_kernels/__init__.py +12 -0
  39. torch-ext/torch_binding.cpp +14 -0
  40. torch-ext/torch_binding.h +6 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ tags:
+ - kernels
+ - sam3
+ ---
+
+ # sam3_kernels
+
+ This repository provides prebuilt CUDA kernel utilities used by the SAM3 model in Transformers.
+
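To make the README concrete, here is a minimal usage sketch. Loading through the Hugging Face `kernels` library and the repo id `medmekk/sam3_kernels` are assumptions based on the repo's tags and owner; the input requirements follow the checks in the CUDA sources further down.

```python
# Hedged usage sketch; the repo id and the `kernels` loader are assumptions.
import torch
from kernels import get_kernel

sam3_kernels = get_kernel("medmekk/sam3_kernels")  # hypothetical repo id

# cc_2d expects a [N, 1, H, W] uint8 CUDA tensor with even H and W;
# it returns [labels, counts] (counts stay zero unless get_counts=True).
masks = (torch.rand(2, 1, 64, 64, device="cuda") > 0.5).to(torch.uint8)
labels, counts = sam3_kernels.cc_2d(masks, True)

# generic_nms expects [N, 4] boxes (x1, y1, x2, y2) and [N] scores on CUDA;
# it returns the indices of the kept boxes.
boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0]], device="cuda")
scores = torch.tensor([0.9, 0.8], device="cuda")
keep = sam3_kernels.generic_nms(boxes, scores, 0.5, False)
```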
build.toml ADDED
@@ -0,0 +1,25 @@
+ [general]
+ name = "sam3_kernels"
+ universal = false
+
+ [torch]
+ src = [
+     "torch-ext/torch_binding.cpp",
+     "torch-ext/torch_binding.h",
+ ]
+
+ [kernel.sam3_kernels]
+ depends = ["torch"]
+ backend = "cuda"
+
+ src = [
+     "sam3_kernels/connected_components.cu",
+     "sam3_kernels/generic_nms.cu",
+ ]
+
+ cuda-flags = [
+     "-DCUDA_HAS_FP16=1",
+     "-D__CUDA_NO_HALF_OPERATORS__",
+     "-D__CUDA_NO_HALF_CONVERSIONS__",
+     "-D__CUDA_NO_HALF2_OPERATORS__",
+ ]
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
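`add_op_namespace_prefix` builds fully qualified `namespace::op` names in the format that `torch.library` utilities expect. As a hedged illustration only (this registration is not part of the upload, and it assumes a PyTorch version with `torch.library.register_fake` and that the snippet lives inside the package next to `_ops.py`), a fake/meta implementation could be attached so `generic_nms` traces under `torch.compile`:

```python
# Illustrative sketch, not part of this upload.
import torch
from ._ops import add_op_namespace_prefix

@torch.library.register_fake(add_op_namespace_prefix("generic_nms"))
def _generic_nms_fake(dets, scores, iou_threshold, use_iou_matrix):
    # NMS keeps a data-dependent number of boxes, so the fake kernel
    # allocates an output with an unbacked dynamic size.
    ctx = torch.library.get_ctx()
    num_keep = ctx.new_dynamic_size()
    return dets.new_empty((num_keep,), dtype=torch.long)
```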
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3df6a1fdcf8c683e752af841ae9faa83e5b8b16e97fcc88d643b443e67c4714e
+ size 2550384
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0047d827c37726f56158fb43f9fbd17d75b503d327be4b8afe27e0b7cb4e7dd
+ size 3018904
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39e65293594fb913dc97687e53f072926827947ffa8dcb92872448c7f53071af
+ size 2991224
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8cf9e7b1c4b7f0de5a963c756edebd4cdb83cceab30ef91dd12a809737180fad
+ size 2554592
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b96ac17bbcfbee46864b4393e08c00009ced852372dcf45e3328d86d838dccc7
+ size 3018936
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8efd2e594bca5a1a12baac389e7ce7660ba65836a2f2253570c683be1dc04f9
+ size 3026784
flake.lock ADDED
@@ -0,0 +1,168 @@
+ {
+   "nodes": {
+     "flake-compat": {
+       "locked": {
+         "lastModified": 1747046372,
+         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-compat_2": {
+       "locked": {
+         "lastModified": 1747046372,
+         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-utils": {
+       "inputs": {
+         "systems": "systems"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "flake-utils_2": {
+       "inputs": {
+         "systems": "systems_2"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "hf-nix": {
+       "inputs": {
+         "flake-compat": "flake-compat_2",
+         "flake-utils": "flake-utils_2",
+         "nixpkgs": "nixpkgs"
+       },
+       "locked": {
+         "lastModified": 1760814603,
+         "narHash": "sha256-i5uuhnJPxOrd0dC8+btp31WMfzPDL8Uwz0TPG2n6nHE=",
+         "owner": "huggingface",
+         "repo": "hf-nix",
+         "rev": "c0b62ec3d0abb11dd2d960e3dfee3a46fc46d111",
+         "type": "github"
+       },
+       "original": {
+         "owner": "huggingface",
+         "repo": "hf-nix",
+         "type": "github"
+       }
+     },
+     "kernel-builder": {
+       "inputs": {
+         "flake-compat": "flake-compat",
+         "flake-utils": "flake-utils",
+         "hf-nix": "hf-nix",
+         "nixpkgs": [
+           "kernel-builder",
+           "hf-nix",
+           "nixpkgs"
+         ]
+       },
+       "locked": {
+         "lastModified": 1761747930,
+         "narHash": "sha256-SBu3W25o5RmAKI5lw9l8ORgaQFgF9+MPHsrtcyJdddg=",
+         "owner": "huggingface",
+         "repo": "kernel-builder",
+         "rev": "fa2380b208bf4be323a5417facf33f3c78c2e440",
+         "type": "github"
+       },
+       "original": {
+         "owner": "huggingface",
+         "repo": "kernel-builder",
+         "type": "github"
+       }
+     },
+     "nixpkgs": {
+       "locked": {
+         "lastModified": 1755963616,
+         "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
+         "owner": "nixos",
+         "repo": "nixpkgs",
+         "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nixos",
+         "ref": "nixos-unstable-small",
+         "repo": "nixpkgs",
+         "type": "github"
+       }
+     },
+     "root": {
+       "inputs": {
+         "kernel-builder": "kernel-builder"
+       }
+     },
+     "systems": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     },
+     "systems_2": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     }
+   },
+   "root": "root",
+   "version": 7
+ }
flake.nix ADDED
@@ -0,0 +1,13 @@
+ {
+   description = "Flake for Torch kernel extension";
+
+   inputs = {
+     kernel-builder.url = "github:huggingface/kernel-builder";
+   };
+
+   outputs = { self, kernel-builder, }:
+     kernel-builder.lib.genFlakeOutputs {
+       inherit self;
+       path = ./.;
+     };
+ }
sam3_kernels/connected_components.cu ADDED
@@ -0,0 +1,305 @@
+ #include <ATen/cuda/CUDAContext.h>
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #include <torch/torch.h>
+ #include <torch/script.h>
+ #include <vector>
+
+ // 2d
+ #define BLOCK_ROWS 16
+ #define BLOCK_COLS 16
+
+ namespace cc2d {
+
+ template <typename T>
+ __device__ __forceinline__ unsigned char hasBit(T bitmap, unsigned char pos) {
+   return (bitmap >> pos) & 1;
+ }
+
+ __device__ int32_t find(const int32_t* s_buf, int32_t n) {
+   while (s_buf[n] != n)
+     n = s_buf[n];
+   return n;
+ }
+
+ __device__ int32_t find_n_compress(int32_t* s_buf, int32_t n) {
+   const int32_t id = n;
+   while (s_buf[n] != n) {
+     n = s_buf[n];
+     s_buf[id] = n;
+   }
+   return n;
+ }
+
+ __device__ void union_(int32_t* s_buf, int32_t a, int32_t b) {
+   bool done;
+   do {
+     a = find(s_buf, a);
+     b = find(s_buf, b);
+
+     if (a < b) {
+       int32_t old = atomicMin(s_buf + b, a);
+       done = (old == b);
+       b = old;
+     } else if (b < a) {
+       int32_t old = atomicMin(s_buf + a, b);
+       done = (old == a);
+       a = old;
+     } else
+       done = true;
+
+   } while (!done);
+ }
+
+ __global__ void
+ init_labeling(int32_t* label, const uint32_t W, const uint32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row < H && col < W)
+     label[offset + idx] = idx; // each image uses local indexing, later +1
+ }
+
+ __global__ void
+ merge(uint8_t* img, int32_t* label, const uint32_t W, const uint32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row >= H || col >= W)
+     return;
+
+   uint32_t P = 0;
+
+   // NOTE: The original vectorized-load code below caused a silent error:
+   // the program kept running but stopped showing printf logs and produced
+   // wrong results, so it is kept only for reference.
+   // uint8_t buffer[4] = {0};
+   // if (col + 1 < W) {
+   //   *(reinterpret_cast<uint16_t*>(buffer)) =
+   //       *(reinterpret_cast<uint16_t*>(img + idx));
+   //   if (row + 1 < H) {
+   //     *(reinterpret_cast<uint16_t*>(buffer + 2)) =
+   //         *(reinterpret_cast<uint16_t*>(img + idx + W));
+   //   }
+   // } else {
+   //   buffer[0] = img[idx];
+   //   if (row + 1 < H)
+   //     buffer[2] = img[idx + W];
+   // }
+   // if (buffer[0]) P |= 0x777;
+   // if (buffer[1]) P |= (0x777 << 1);
+   // if (buffer[2]) P |= (0x777 << 4);
+
+   if (img[offset + idx])
+     P |= 0x777;
+   if (row + 1 < H && img[offset + idx + W])
+     P |= 0x777 << 4;
+   if (col + 1 < W && img[offset + idx + 1])
+     P |= 0x777 << 1;
+
+   if (col == 0)
+     P &= 0xEEEE;
+   if (col + 1 >= W)
+     P &= 0x3333;
+   else if (col + 2 >= W)
+     P &= 0x7777;
+
+   if (row == 0)
+     P &= 0xFFF0;
+   if (row + 1 >= H)
+     P &= 0xFF;
+
+   if (P > 0) {
+     // If bit 0 says the top-left neighbour must be checked and that pixel
+     // is set, merge with the top-left block.
+     if (hasBit(P, 0) && img[offset + idx - W - 1]) {
+       union_(label + offset, idx, idx - 2 * W - 2); // top-left block
+     }
+
+     if ((hasBit(P, 1) && img[offset + idx - W]) ||
+         (hasBit(P, 2) && img[offset + idx - W + 1]))
+       union_(label + offset, idx, idx - 2 * W); // top block
+
+     if (hasBit(P, 3) && img[offset + idx + 2 - W])
+       union_(label + offset, idx, idx - 2 * W + 2); // top-right block
+
+     if ((hasBit(P, 4) && img[offset + idx - 1]) ||
+         (hasBit(P, 8) && img[offset + idx + W - 1]))
+       union_(label + offset, idx, idx - 2); // left block
+   }
+ }
+
+ __global__ void compression(int32_t* label, const int32_t W, const int32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row < H && col < W)
+     find_n_compress(label + offset, idx);
+ }
+
+ __global__ void final_labeling(
+     const uint8_t* img,
+     int32_t* label,
+     const int32_t W,
+     const int32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row >= H || col >= W)
+     return;
+
+   int32_t y = label[offset + idx] + 1;
+
+   if (img[offset + idx])
+     label[offset + idx] = y;
+   else
+     label[offset + idx] = 0;
+
+   if (col + 1 < W) {
+     if (img[offset + idx + 1])
+       label[offset + idx + 1] = y;
+     else
+       label[offset + idx + 1] = 0;
+
+     if (row + 1 < H) {
+       if (img[offset + idx + W + 1])
+         label[offset + idx + W + 1] = y;
+       else
+         label[offset + idx + W + 1] = 0;
+     }
+   }
+
+   if (row + 1 < H) {
+     if (img[offset + idx + W])
+       label[offset + idx + W] = y;
+     else
+       label[offset + idx + W] = 0;
+   }
+ }
+
+ __global__ void init_counting(
+     const int32_t* label,
+     int32_t* count_init,
+     const int32_t W,
+     const int32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row >= H || col >= W)
+     return;
+
+   int32_t y = label[offset + idx];
+   if (y > 0) {
+     int32_t count_idx = y - 1;
+     atomicAdd(count_init + offset + count_idx, 1);
+   }
+ }
+
+ __global__ void final_counting(
+     const int32_t* label,
+     const int32_t* count_init,
+     int32_t* count_final,
+     const int32_t W,
+     const int32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row >= H || col >= W)
+     return;
+
+   int32_t y = label[offset + idx];
+   if (y > 0) {
+     int32_t count_idx = y - 1;
+     count_final[offset + idx] = count_init[offset + count_idx];
+   } else {
+     count_final[offset + idx] = 0;
+   }
+ }
+
+ } // namespace cc2d
+
+ std::vector<torch::Tensor> connected_components_labeling_2d(
+     const torch::Tensor& inputs,
+     bool get_counts) {
+   AT_ASSERTM(inputs.is_cuda(), "inputs must be a CUDA tensor");
+   AT_ASSERTM(inputs.ndimension() == 4, "inputs must be [N, 1, H, W] shape");
+   AT_ASSERTM(
+       inputs.scalar_type() == torch::kUInt8, "inputs must be a uint8 type");
+
+   const uint32_t N = inputs.size(0);
+   const uint32_t C = inputs.size(1);
+   const uint32_t H = inputs.size(2);
+   const uint32_t W = inputs.size(3);
+
+   AT_ASSERTM(C == 1, "inputs must be [N, 1, H, W] shape");
+   AT_ASSERTM((H % 2) == 0, "height must be an even number");
+   AT_ASSERTM((W % 2) == 0, "width must be an even number");
+
+   // labels are stored as int32
+   auto label_options =
+       torch::TensorOptions().dtype(torch::kInt32).device(inputs.device());
+   torch::Tensor labels = torch::zeros({N, C, H, W}, label_options);
+   torch::Tensor counts_init = torch::zeros({N, C, H, W}, label_options);
+   torch::Tensor counts_final = torch::zeros({N, C, H, W}, label_options);
+
+   if (N == 0 || H == 0 || W == 0) {
+     // empty input masks: return empty label and count tensors
+     // returned values are [labels, counts]
+     std::vector<torch::Tensor> outputs;
+     outputs.push_back(labels);
+     outputs.push_back(counts_final);
+     return outputs;
+   }
+
+   dim3 grid = dim3(
+       ((W + 1) / 2 + BLOCK_COLS - 1) / BLOCK_COLS,
+       ((H + 1) / 2 + BLOCK_ROWS - 1) / BLOCK_ROWS,
+       N);
+   dim3 block = dim3(BLOCK_COLS, BLOCK_ROWS);
+   dim3 grid_count =
+       dim3((W + BLOCK_COLS) / BLOCK_COLS, (H + BLOCK_ROWS) / BLOCK_ROWS, N);
+   dim3 block_count = dim3(BLOCK_COLS, BLOCK_ROWS);
+   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   cc2d::init_labeling<<<grid, block, 0, stream>>>(
+       labels.data_ptr<int32_t>(), W, H);
+   cc2d::merge<<<grid, block, 0, stream>>>(
+       inputs.data_ptr<uint8_t>(), labels.data_ptr<int32_t>(), W, H);
+   cc2d::compression<<<grid, block, 0, stream>>>(
+       labels.data_ptr<int32_t>(), W, H);
+   cc2d::final_labeling<<<grid, block, 0, stream>>>(
+       inputs.data_ptr<uint8_t>(), labels.data_ptr<int32_t>(), W, H);
+
+   if (get_counts) {
+     cc2d::init_counting<<<grid_count, block_count, 0, stream>>>(
+         labels.data_ptr<int32_t>(), counts_init.data_ptr<int32_t>(), W, H);
+     cc2d::final_counting<<<grid_count, block_count, 0, stream>>>(
+         labels.data_ptr<int32_t>(),
+         counts_init.data_ptr<int32_t>(),
+         counts_final.data_ptr<int32_t>(),
+         W,
+         H);
+   }
+
+   // returned values are [labels, counts]
+   std::vector<torch::Tensor> outputs;
+   outputs.push_back(labels);
+   outputs.push_back(counts_final);
+   return outputs;
+ }
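The file above implements block-based union-find labeling on 2x2 pixel blocks. For sanity-checking its semantics on CPU, here is a hedged pure-Python reference (an interpretation of intent, not part of the upload): 8-connected components per mask, positive labels on foreground (the CUDA kernel numbers components by root pixel index + 1, so exact label values differ), and per-pixel component sizes for the counts output.

```python
# CPU reference for cc_2d semantics on a single H x W mask; a sketch,
# not the CUDA algorithm. Label ids differ from the kernel's output.
from collections import deque

def cc_2d_reference(mask):
    """mask: list of lists of 0/1. Returns (labels, counts)."""
    H, W = len(mask), len(mask[0])
    labels = [[0] * W for _ in range(H)]
    counts = [[0] * W for _ in range(H)]
    next_label = 1
    for sy in range(H):
        for sx in range(W):
            if mask[sy][sx] and not labels[sy][sx]:
                # BFS flood fill over the 8-connected neighbourhood.
                component, queue = [], deque([(sy, sx)])
                labels[sy][sx] = next_label
                while queue:
                    y, x = queue.popleft()
                    component.append((y, x))
                    for dy in (-1, 0, 1):
                        for dx in (-1, 0, 1):
                            ny, nx = y + dy, x + dx
                            if (0 <= ny < H and 0 <= nx < W
                                    and mask[ny][nx] and not labels[ny][nx]):
                                labels[ny][nx] = next_label
                                queue.append((ny, nx))
                # Every pixel of the component stores the component size.
                for y, x in component:
                    counts[y][x] = len(component)
                next_label += 1
    return labels, counts
```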
sam3_kernels/generic_nms.cu ADDED
@@ -0,0 +1,331 @@
+ #include <ATen/ATen.h>
+ #include <ATen/AccumulateType.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <torch/torch.h>
+ #include <torch/library.h>
+
+ namespace {
+
+ template <typename integer>
+ constexpr __host__ __device__ inline integer ceil_div(integer n, integer m) {
+   return (n + m - 1) / m;
+ }
+
+ int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+ template <typename T>
+ __device__ inline bool
+ devIoU(T const* const a, T const* const b, const float threshold) {
+   T left = max(a[0], b[0]), right = min(a[2], b[2]);
+   T top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+   T width = max(right - left, (T)0), height = max(bottom - top, (T)0);
+   using acc_T = at::acc_type<T, /*is_cuda=*/true>;
+   acc_T interS = (acc_T)width * height;
+   acc_T Sa = ((acc_T)a[2] - a[0]) * (a[3] - a[1]);
+   acc_T Sb = ((acc_T)b[2] - b[0]) * (b[3] - b[1]);
+   return (interS / (Sa + Sb - interS)) > threshold;
+ }
+
+ template <typename T>
+ __global__ void nms_kernel_impl(
+     int n_boxes,
+     double iou_threshold,
+     const T* dev_boxes,
+     unsigned long long* dev_mask) {
+   const int row_start = blockIdx.y;
+   const int col_start = blockIdx.x;
+
+   if (row_start > col_start)
+     return;
+
+   const int row_size =
+       min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+   const int col_size =
+       min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+   __shared__ T block_boxes[threadsPerBlock * 4];
+   if (threadIdx.x < col_size) {
+     block_boxes[threadIdx.x * 4 + 0] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
+     block_boxes[threadIdx.x * 4 + 1] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
+     block_boxes[threadIdx.x * 4 + 2] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
+     block_boxes[threadIdx.x * 4 + 3] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
+   }
+   __syncthreads();
+
+   if (threadIdx.x < row_size) {
+     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+     const T* cur_box = dev_boxes + cur_box_idx * 4;
+     int i = 0;
+     unsigned long long t = 0;
+     int start = 0;
+     if (row_start == col_start) {
+       start = threadIdx.x + 1;
+     }
+     for (i = start; i < col_size; i++) {
+       if (devIoU<T>(cur_box, block_boxes + i * 4, iou_threshold)) {
+         t |= 1ULL << i;
+       }
+     }
+     const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
+     dev_mask[cur_box_idx * col_blocks + col_start] = t;
+   }
+ }
+
+ template <typename T>
+ __global__ void nms_kernel_iou_impl(
+     int n_boxes,
+     double iou_threshold,
+     const T* dev_iou, // [N, N] row-major IoU matrix
+     unsigned long long* dev_mask) {
+   const int row_start = blockIdx.y;
+   const int col_start = blockIdx.x;
+
+   if (row_start > col_start)
+     return;
+
+   const int row_size =
+       min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+   const int col_size =
+       min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+   if (threadIdx.x < row_size) {
+     const int cur_row_idx = threadsPerBlock * row_start + threadIdx.x;
+     int i = 0;
+     unsigned long long t = 0;
+     int start = 0;
+     if (row_start == col_start) {
+       start = threadIdx.x + 1;
+     }
+     const int col_base = threadsPerBlock * col_start;
+     for (i = start; i < col_size; i++) {
+       const int col_idx = col_base + i;
+       T iou = dev_iou[cur_row_idx * n_boxes + col_idx];
+       if (static_cast<double>(iou) > iou_threshold) {
+         t |= 1ULL << i;
+       }
+     }
+     const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
+     dev_mask[cur_row_idx * col_blocks + col_start] = t;
+   }
+ }
+
+ __global__ static void gather_keep_from_mask(
+     bool* keep,
+     const unsigned long long* dev_mask,
+     const int n_boxes) {
+   // Taken and adapted from mmcv
+   // https://github.com/open-mmlab/mmcv/blob/03ce9208d18c0a63d7ffa087ea1c2f5661f2441a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh#L76
+   const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
+   const int thread_id = threadIdx.x;
+
+   // Mark the bboxes which have been removed.
+   extern __shared__ unsigned long long removed[];
+
+   // Initialize removed.
+   for (int i = thread_id; i < col_blocks; i += blockDim.x) {
+     removed[i] = 0;
+   }
+   __syncthreads();
+
+   for (int nblock = 0; nblock < col_blocks; nblock++) {
+     auto removed_val = removed[nblock];
+     __syncthreads();
+     const int i_offset = nblock * threadsPerBlock;
+ #pragma unroll
+     for (int inblock = 0; inblock < threadsPerBlock; inblock++) {
+       const int i = i_offset + inblock;
+       if (i >= n_boxes)
+         break;
+       // Select a candidate and check whether it should be kept.
+       if (!(removed_val & (1ULL << inblock))) {
+         if (thread_id == 0) {
+           keep[i] = true;
+         }
+         auto p = dev_mask + i * col_blocks;
+         // Remove all bboxes which overlap the candidate.
+         for (int j = thread_id; j < col_blocks; j += blockDim.x) {
+           if (j >= nblock)
+             removed[j] |= p[j];
+         }
+         __syncthreads();
+         removed_val = removed[nblock];
+       }
+     }
+   }
+ }
+
+ // Extended op with explicit flag
+ at::Tensor nms_kernel_ex(
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     double iou_threshold,
+     bool use_iou_matrix) {
+   TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor");
+   TORCH_CHECK(scores.is_cuda(), "scores must be a CUDA tensor");
+   TORCH_CHECK(
+       dets.dim() == 2,
+       "first argument should be a 2d tensor, got ",
+       dets.dim(),
+       "D");
+   TORCH_CHECK(
+       scores.dim() == 1,
+       "scores should be a 1d tensor, got ",
+       scores.dim(),
+       "D");
+   TORCH_CHECK(
+       dets.size(0) == scores.size(0),
+       "first argument and scores should have same number of elements in dimension 0, got ",
+       dets.size(0),
+       " and ",
+       scores.size(0));
+
+   at::cuda::CUDAGuard device_guard(dets.device());
+
+   if (dets.numel() == 0) {
+     return at::empty({0}, dets.options().dtype(at::kLong));
+   }
+
+   auto order_t = std::get<1>(
+       scores.sort(/*stable=*/true, /*dim=*/0, /*descending=*/true));
+   int dets_num = dets.size(0);
+   const int col_blocks = ceil_div(dets_num, threadsPerBlock);
+
+   at::Tensor mask =
+       at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+   dim3 blocks(col_blocks, col_blocks);
+   dim3 threads(threadsPerBlock);
+   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   if (use_iou_matrix) {
+     TORCH_CHECK(
+         dets.size(0) == dets.size(1),
+         "when use_iou_matrix=True, first argument must be [N,N]");
+     auto sorted_iou =
+         dets.index_select(0, order_t).index_select(1, order_t).contiguous();
+     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+         sorted_iou.scalar_type(), "nms_kernel_iou_ex", [&] {
+           nms_kernel_iou_impl<scalar_t><<<blocks, threads, 0, stream>>>(
+               dets_num,
+               iou_threshold,
+               sorted_iou.data_ptr<scalar_t>(),
+               (unsigned long long*)mask.data_ptr<int64_t>());
+         });
+   } else {
+     TORCH_CHECK(
+         dets.size(1) == 4, "when use_iou_matrix=False, boxes must be [N,4]");
+     auto dets_sorted = dets.index_select(0, order_t).contiguous();
+     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+         dets_sorted.scalar_type(), "nms_kernel_ex", [&] {
+           nms_kernel_impl<scalar_t><<<blocks, threads, 0, stream>>>(
+               dets_num,
+               iou_threshold,
+               dets_sorted.data_ptr<scalar_t>(),
+               (unsigned long long*)mask.data_ptr<int64_t>());
+         });
+   }
+
+   at::Tensor keep =
+       at::zeros({dets_num}, dets.options().dtype(at::kBool).device(at::kCUDA));
+   gather_keep_from_mask<<<
+       1,
+       min(col_blocks, threadsPerBlock),
+       col_blocks * sizeof(unsigned long long),
+       stream>>>(
+       keep.data_ptr<bool>(),
+       (unsigned long long*)mask.data_ptr<int64_t>(),
+       dets_num);
+
+   AT_CUDA_CHECK(cudaGetLastError());
+   return order_t.masked_select(keep);
+ }
+
+ } // namespace
+
+ at::Tensor generic_nms(
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     double iou_threshold,
+     bool use_iou_matrix) {
+   TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor");
+   TORCH_CHECK(scores.is_cuda(), "scores must be a CUDA tensor");
+   TORCH_CHECK(
+       dets.dim() == 2,
+       "first argument should be a 2d tensor, got ",
+       dets.dim(),
+       "D");
+   TORCH_CHECK(
+       scores.dim() == 1,
+       "scores should be a 1d tensor, got ",
+       scores.dim(),
+       "D");
+   TORCH_CHECK(
+       dets.size(0) == scores.size(0),
+       "first argument and scores should have same number of elements in dimension 0, got ",
+       dets.size(0),
+       " and ",
+       scores.size(0));
+
+   at::cuda::CUDAGuard device_guard(dets.device());
+
+   if (dets.numel() == 0) {
+     return at::empty({0}, dets.options().dtype(at::kLong));
+   }
+
+   auto order_t = std::get<1>(
+       scores.sort(/*stable=*/true, /*dim=*/0, /*descending=*/true));
+   int dets_num = dets.size(0);
+   const int col_blocks = ceil_div(dets_num, threadsPerBlock);
+
+   at::Tensor mask =
+       at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+   dim3 blocks(col_blocks, col_blocks);
+   dim3 threads(threadsPerBlock);
+   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   if (use_iou_matrix) {
+     TORCH_CHECK(
+         dets.size(0) == dets.size(1),
+         "when use_iou_matrix=True, first argument must be [N,N]");
+     auto sorted_iou =
+         dets.index_select(0, order_t).index_select(1, order_t).contiguous();
+     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+         sorted_iou.scalar_type(), "nms_kernel_iou_ex", [&] {
+           nms_kernel_iou_impl<scalar_t><<<blocks, threads, 0, stream>>>(
+               dets_num,
+               iou_threshold,
+               sorted_iou.data_ptr<scalar_t>(),
+               (unsigned long long*)mask.data_ptr<int64_t>());
+         });
+   } else {
+     TORCH_CHECK(
+         dets.size(1) == 4, "when use_iou_matrix=False, boxes must be [N,4]");
+     auto dets_sorted = dets.index_select(0, order_t).contiguous();
+     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+         dets_sorted.scalar_type(), "nms_kernel_ex", [&] {
+           nms_kernel_impl<scalar_t><<<blocks, threads, 0, stream>>>(
+               dets_num,
+               iou_threshold,
+               dets_sorted.data_ptr<scalar_t>(),
+               (unsigned long long*)mask.data_ptr<int64_t>());
+         });
+   }
+
+   at::Tensor keep =
+       at::zeros({dets_num}, dets.options().dtype(at::kBool).device(at::kCUDA));
+   gather_keep_from_mask<<<
+       1,
+       min(col_blocks, threadsPerBlock),
+       col_blocks * sizeof(unsigned long long),
+       stream>>>(
+       keep.data_ptr<bool>(),
+       (unsigned long long*)mask.data_ptr<int64_t>(),
+       dets_num);
+
+   AT_CUDA_CHECK(cudaGetLastError());
+   return order_t.masked_select(keep);
+ }
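For the box path (`use_iou_matrix=False`), the file computes standard greedy NMS over score-sorted boxes and returns the kept indices into the original order (`order_t.masked_select(keep)`). A hedged CPU reference for validation follows; it is illustrative, not the CUDA algorithm.

```python
# Greedy NMS reference for the use_iou_matrix=False path; a sketch for
# cross-checking outputs, not the CUDA implementation.
import torch

def generic_nms_reference(dets, scores, iou_threshold):
    """dets: [N, 4] as (x1, y1, x2, y2); scores: [N]. Returns kept indices."""
    order = scores.sort(descending=True, stable=True).indices
    keep, suppressed = [], torch.zeros(len(order), dtype=torch.bool)
    for i in range(len(order)):
        if suppressed[i]:
            continue
        keep.append(order[i].item())
        a = dets[order[i]]
        for j in range(i + 1, len(order)):
            if suppressed[j]:
                continue
            b = dets[order[j]]
            # Intersection-over-union of boxes a and b.
            lt = torch.maximum(a[:2], b[:2])
            rb = torch.minimum(a[2:], b[2:])
            wh = (rb - lt).clamp(min=0)
            inter = wh[0] * wh[1]
            area_a = (a[2] - a[0]) * (a[3] - a[1])
            area_b = (b[2] - b[0]) * (b[3] - b[1])
            if inter / (area_a + area_b - inter) > iou_threshold:
                suppressed[j] = True
    return torch.tensor(keep, dtype=torch.long)
```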
torch-ext/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,14 @@
+ #include <torch/library.h>
+
+ #include "registration.h"
+ #include "torch_binding.h"
+
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+   ops.def("cc_2d(Tensor inputs, bool get_counts) -> Tensor[]");
+   ops.impl("cc_2d", torch::kCUDA, &connected_components_labeling_2d);
+
+   ops.def("generic_nms(Tensor dets, Tensor scores, float iou_threshold, bool use_iou_matrix) -> Tensor");
+   ops.impl("generic_nms", torch::kCUDA, &generic_nms);
+ }
+
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
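The schema strings above fix the Python-visible signatures: schema `float` arrives as a C++ `double`, and `-> Tensor[]` comes back as a list of tensors. A hedged sketch of invoking the registered ops directly through `torch.ops`, assuming the extension library has been loaded by importing the package:

```python
# Sketch: calling the registered ops directly. Importing sam3_kernels loads
# the extension .so (via `from . import _sam3_kernels_19700101000000`),
# which registers the ops under the versioned namespace.
import torch
import sam3_kernels  # noqa: F401  (triggers extension registration)

ops = torch.ops._sam3_kernels_19700101000000

masks = (torch.rand(1, 1, 32, 32, device="cuda") > 0.5).to(torch.uint8)
labels, counts = ops.cc_2d(masks, True)  # "-> Tensor[]" returns a list

xy1 = torch.rand(8, 2, device="cuda") * 10
wh = torch.rand(8, 2, device="cuda") * 5 + 1
boxes = torch.cat([xy1, xy1 + wh], dim=1)  # valid (x1, y1, x2, y2)
scores = torch.rand(8, device="cuda")
keep = ops.generic_nms(boxes, scores, 0.5, False)
```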
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,6 @@
+ #pragma once
+
+ #include <torch/torch.h>
+
+ std::vector<torch::Tensor> connected_components_labeling_2d(const torch::Tensor &inputs, bool get_counts);
+ torch::Tensor generic_nms(const torch::Tensor &dets, const torch::Tensor &scores, double iou_threshold, bool use_iou_matrix);