medmekk (HF Staff) committed on
Commit 12075d1 · verified · 1 Parent(s): e47b92d

Upload folder using huggingface_hub

Files changed (40):
  1. .gitattributes +6 -0
  2. README.md +10 -0
  3. build.toml +25 -0
  4. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__init__.py +12 -0
  5. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  6. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  7. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_ops.py +9 -0
  8. build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  9. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__init__.py +12 -0
  10. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  11. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  12. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_ops.py +9 -0
  13. build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  14. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__init__.py +12 -0
  15. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  16. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  17. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_ops.py +9 -0
  18. build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  19. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__init__.py +12 -0
  20. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  21. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  22. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_ops.py +9 -0
  23. build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  24. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__init__.py +12 -0
  25. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  26. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  27. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_ops.py +9 -0
  28. build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  29. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__init__.py +12 -0
  30. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc +0 -0
  31. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc +0 -0
  32. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_ops.py +9 -0
  33. build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so +3 -0
  34. flake.lock +168 -0
  35. flake.nix +13 -0
  36. sam3_kernels/connected_components.cu +305 -0
  37. sam3_kernels/generic_nms.cu +331 -0
  38. torch-ext/sam3_kernels/__init__.py +12 -0
  39. torch-ext/torch_binding.cpp +14 -0
  40. torch-ext/torch_binding.h +6 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ tags:
+ - kernels
+ - sam3
+ ---
+
+ # sam3_kernels
+
+ This repository provides prebuilt CUDA kernel utilities used by the SAM3 model in Transformers.
+
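To make the README concrete, here is a minimal usage sketch. Loading through the Hugging Face `kernels` library and the repo id `medmekk/sam3_kernels` are assumptions based on the repo's tags and owner; the input requirements follow the checks in the CUDA sources further down.

```python
# Hedged usage sketch; the repo id and the `kernels` loader are assumptions.
import torch
from kernels import get_kernel

sam3_kernels = get_kernel("medmekk/sam3_kernels")  # hypothetical repo id

# cc_2d expects a [N, 1, H, W] uint8 CUDA tensor with even H and W;
# it returns [labels, counts] (counts stay zero unless get_counts=True).
masks = (torch.rand(2, 1, 64, 64, device="cuda") > 0.5).to(torch.uint8)
labels, counts = sam3_kernels.cc_2d(masks, True)

# generic_nms expects [N, 4] boxes (x1, y1, x2, y2) and [N] scores on CUDA;
# it returns the indices of the kept boxes.
boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0]], device="cuda")
scores = torch.tensor([0.9, 0.8], device="cuda")
keep = sam3_kernels.generic_nms(boxes, scores, 0.5, False)
```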
build.toml ADDED
@@ -0,0 +1,25 @@
+ [general]
+ name = "sam3_kernels"
+ universal = false
+
+ [torch]
+ src = [
+     "torch-ext/torch_binding.cpp",
+     "torch-ext/torch_binding.h",
+ ]
+
+ [kernel.sam3_kernels]
+ depends = ["torch"]
+ backend = "cuda"
+
+ src = [
+     "sam3_kernels/connected_components.cu",
+     "sam3_kernels/generic_nms.cu",
+ ]
+
+ cuda-flags = [
+     "-DCUDA_HAS_FP16=1",
+     "-D__CUDA_NO_HALF_OPERATORS__",
+     "-D__CUDA_NO_HALF_CONVERSIONS__",
+     "-D__CUDA_NO_HALF2_OPERATORS__",
+ ]
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
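`add_op_namespace_prefix` builds fully qualified `namespace::op` names in the format that `torch.library` utilities expect. As a hedged illustration only (this registration is not part of the upload, and it assumes a PyTorch version with `torch.library.register_fake` and that the snippet lives inside the package next to `_ops.py`), a fake/meta implementation could be attached so `generic_nms` traces under `torch.compile`:

```python
# Illustrative sketch, not part of this upload.
import torch
from ._ops import add_op_namespace_prefix

@torch.library.register_fake(add_op_namespace_prefix("generic_nms"))
def _generic_nms_fake(dets, scores, iou_threshold, use_iou_matrix):
    # NMS keeps a data-dependent number of boxes, so the fake kernel
    # allocates an output with an unbacked dynamic size.
    ctx = torch.library.get_ctx()
    num_keep = ctx.new_dynamic_size()
    return dets.new_empty((num_keep,), dtype=torch.long)
```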
build/torch28-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3df6a1fdcf8c683e752af841ae9faa83e5b8b16e97fcc88d643b443e67c4714e
+ size 2550384
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch28-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0047d827c37726f56158fb43f9fbd17d75b503d327be4b8afe27e0b7cb4e7dd
+ size 3018904
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch28-cxx11-cu129-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39e65293594fb913dc97687e53f072926827947ffa8dcb92872448c7f53071af
+ size 2991224
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch29-cxx11-cu126-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8cf9e7b1c4b7f0de5a963c756edebd4cdb83cceab30ef91dd12a809737180fad
+ size 2554592
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b96ac17bbcfbee46864b4393e08c00009ced852372dcf45e3328d86d838dccc7
+ size 3018936
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.01 kB)
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/__pycache__/_ops.cpython-313.pyc ADDED
Binary file (546 Bytes)
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _sam3_kernels_19700101000000
+ ops = torch.ops._sam3_kernels_19700101000000
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_sam3_kernels_19700101000000::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/sam3_kernels/_sam3_kernels_19700101000000.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8efd2e594bca5a1a12baac389e7ce7660ba65836a2f2253570c683be1dc04f9
+ size 3026784
flake.lock ADDED
@@ -0,0 +1,168 @@
+ {
+   "nodes": {
+     "flake-compat": {
+       "locked": {
+         "lastModified": 1747046372,
+         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-compat_2": {
+       "locked": {
+         "lastModified": 1747046372,
+         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-utils": {
+       "inputs": {
+         "systems": "systems"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "flake-utils_2": {
+       "inputs": {
+         "systems": "systems_2"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "hf-nix": {
+       "inputs": {
+         "flake-compat": "flake-compat_2",
+         "flake-utils": "flake-utils_2",
+         "nixpkgs": "nixpkgs"
+       },
+       "locked": {
+         "lastModified": 1760814603,
+         "narHash": "sha256-i5uuhnJPxOrd0dC8+btp31WMfzPDL8Uwz0TPG2n6nHE=",
+         "owner": "huggingface",
+         "repo": "hf-nix",
+         "rev": "c0b62ec3d0abb11dd2d960e3dfee3a46fc46d111",
+         "type": "github"
+       },
+       "original": {
+         "owner": "huggingface",
+         "repo": "hf-nix",
+         "type": "github"
+       }
+     },
+     "kernel-builder": {
+       "inputs": {
+         "flake-compat": "flake-compat",
+         "flake-utils": "flake-utils",
+         "hf-nix": "hf-nix",
+         "nixpkgs": [
+           "kernel-builder",
+           "hf-nix",
+           "nixpkgs"
+         ]
+       },
+       "locked": {
+         "lastModified": 1761747930,
+         "narHash": "sha256-SBu3W25o5RmAKI5lw9l8ORgaQFgF9+MPHsrtcyJdddg=",
+         "owner": "huggingface",
+         "repo": "kernel-builder",
+         "rev": "fa2380b208bf4be323a5417facf33f3c78c2e440",
+         "type": "github"
+       },
+       "original": {
+         "owner": "huggingface",
+         "repo": "kernel-builder",
+         "type": "github"
+       }
+     },
+     "nixpkgs": {
+       "locked": {
+         "lastModified": 1755963616,
+         "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
+         "owner": "nixos",
+         "repo": "nixpkgs",
+         "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nixos",
+         "ref": "nixos-unstable-small",
+         "repo": "nixpkgs",
+         "type": "github"
+       }
+     },
+     "root": {
+       "inputs": {
+         "kernel-builder": "kernel-builder"
+       }
+     },
+     "systems": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     },
+     "systems_2": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     }
+   },
+   "root": "root",
+   "version": 7
+ }
flake.nix ADDED
@@ -0,0 +1,13 @@
+ {
+   description = "Flake for Torch kernel extension";
+
+   inputs = {
+     kernel-builder.url = "github:huggingface/kernel-builder";
+   };
+
+   outputs = { self, kernel-builder, }:
+     kernel-builder.lib.genFlakeOutputs {
+       inherit self;
+       path = ./.;
+     };
+ }
sam3_kernels/connected_components.cu ADDED
@@ -0,0 +1,305 @@
+ #include <ATen/cuda/CUDAContext.h>
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #include <torch/torch.h>
+ #include <torch/script.h>
+ #include <vector>
+
+ // 2d
+ #define BLOCK_ROWS 16
+ #define BLOCK_COLS 16
+
+ namespace cc2d {
+
+ template <typename T>
+ __device__ __forceinline__ unsigned char hasBit(T bitmap, unsigned char pos) {
+   return (bitmap >> pos) & 1;
+ }
+
+ __device__ int32_t find(const int32_t* s_buf, int32_t n) {
+   while (s_buf[n] != n)
+     n = s_buf[n];
+   return n;
+ }
+
+ __device__ int32_t find_n_compress(int32_t* s_buf, int32_t n) {
+   const int32_t id = n;
+   while (s_buf[n] != n) {
+     n = s_buf[n];
+     s_buf[id] = n;
+   }
+   return n;
+ }
+
+ __device__ void union_(int32_t* s_buf, int32_t a, int32_t b) {
+   bool done;
+   do {
+     a = find(s_buf, a);
+     b = find(s_buf, b);
+
+     if (a < b) {
+       int32_t old = atomicMin(s_buf + b, a);
+       done = (old == b);
+       b = old;
+     } else if (b < a) {
+       int32_t old = atomicMin(s_buf + a, b);
+       done = (old == a);
+       a = old;
+     } else
+       done = true;
+
+   } while (!done);
+ }
+
+ __global__ void
+ init_labeling(int32_t* label, const uint32_t W, const uint32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row < H && col < W)
+     label[offset + idx] = idx; // each image uses local indexing, later +1
+ }
+
+ __global__ void
+ merge(uint8_t* img, int32_t* label, const uint32_t W, const uint32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row >= H || col >= W)
+     return;
+
+   uint32_t P = 0;
+
+   // NOTE: The original vectorized-load code below caused a silent error:
+   // the program kept running but stopped showing printf logs and produced
+   // wrong results, so it is kept only for reference.
+   // uint8_t buffer[4] = {0};
+   // if (col + 1 < W) {
+   //   *(reinterpret_cast<uint16_t*>(buffer)) =
+   //       *(reinterpret_cast<uint16_t*>(img + idx));
+   //   if (row + 1 < H) {
+   //     *(reinterpret_cast<uint16_t*>(buffer + 2)) =
+   //         *(reinterpret_cast<uint16_t*>(img + idx + W));
+   //   }
+   // } else {
+   //   buffer[0] = img[idx];
+   //   if (row + 1 < H)
+   //     buffer[2] = img[idx + W];
+   // }
+   // if (buffer[0]) P |= 0x777;
+   // if (buffer[1]) P |= (0x777 << 1);
+   // if (buffer[2]) P |= (0x777 << 4);
+
+   if (img[offset + idx])
+     P |= 0x777;
+   if (row + 1 < H && img[offset + idx + W])
+     P |= 0x777 << 4;
+   if (col + 1 < W && img[offset + idx + 1])
+     P |= 0x777 << 1;
+
+   if (col == 0)
+     P &= 0xEEEE;
+   if (col + 1 >= W)
+     P &= 0x3333;
+   else if (col + 2 >= W)
+     P &= 0x7777;
+
+   if (row == 0)
+     P &= 0xFFF0;
+   if (row + 1 >= H)
+     P &= 0xFF;
+
+   if (P > 0) {
+     // If bit 0 says the top-left neighbour must be checked and that pixel
+     // is set, merge with the top-left block.
+     if (hasBit(P, 0) && img[offset + idx - W - 1]) {
+       union_(label + offset, idx, idx - 2 * W - 2); // top-left block
+     }
+
+     if ((hasBit(P, 1) && img[offset + idx - W]) ||
+         (hasBit(P, 2) && img[offset + idx - W + 1]))
+       union_(label + offset, idx, idx - 2 * W); // top block
+
+     if (hasBit(P, 3) && img[offset + idx + 2 - W])
+       union_(label + offset, idx, idx - 2 * W + 2); // top-right block
+
+     if ((hasBit(P, 4) && img[offset + idx - 1]) ||
+         (hasBit(P, 8) && img[offset + idx + W - 1]))
+       union_(label + offset, idx, idx - 2); // left block
+   }
+ }
+
+ __global__ void compression(int32_t* label, const int32_t W, const int32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row < H && col < W)
+     find_n_compress(label + offset, idx);
+ }
+
+ __global__ void final_labeling(
+     const uint8_t* img,
+     int32_t* label,
+     const int32_t W,
+     const int32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row >= H || col >= W)
+     return;
+
+   int32_t y = label[offset + idx] + 1;
+
+   if (img[offset + idx])
+     label[offset + idx] = y;
+   else
+     label[offset + idx] = 0;
+
+   if (col + 1 < W) {
+     if (img[offset + idx + 1])
+       label[offset + idx + 1] = y;
+     else
+       label[offset + idx + 1] = 0;
+
+     if (row + 1 < H) {
+       if (img[offset + idx + W + 1])
+         label[offset + idx + W + 1] = y;
+       else
+         label[offset + idx + W + 1] = 0;
+     }
+   }
+
+   if (row + 1 < H) {
+     if (img[offset + idx + W])
+       label[offset + idx + W] = y;
+     else
+       label[offset + idx + W] = 0;
+   }
+ }
+
+ __global__ void init_counting(
+     const int32_t* label,
+     int32_t* count_init,
+     const int32_t W,
+     const int32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row >= H || col >= W)
+     return;
+
+   int32_t y = label[offset + idx];
+   if (y > 0) {
+     int32_t count_idx = y - 1;
+     atomicAdd(count_init + offset + count_idx, 1);
+   }
+ }
+
+ __global__ void final_counting(
+     const int32_t* label,
+     const int32_t* count_init,
+     int32_t* count_final,
+     const int32_t W,
+     const int32_t H) {
+   const uint32_t n = blockIdx.z; // batch index
+   const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+   const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+   const uint32_t idx = row * W + col;
+   const uint32_t offset = n * H * W;
+
+   if (row >= H || col >= W)
+     return;
+
+   int32_t y = label[offset + idx];
+   if (y > 0) {
+     int32_t count_idx = y - 1;
+     count_final[offset + idx] = count_init[offset + count_idx];
+   } else {
+     count_final[offset + idx] = 0;
+   }
+ }
+
+ } // namespace cc2d
+
+ std::vector<torch::Tensor> connected_components_labeling_2d(
+     const torch::Tensor& inputs,
+     bool get_counts) {
+   AT_ASSERTM(inputs.is_cuda(), "inputs must be a CUDA tensor");
+   AT_ASSERTM(inputs.ndimension() == 4, "inputs must be [N, 1, H, W] shape");
+   AT_ASSERTM(
+       inputs.scalar_type() == torch::kUInt8, "inputs must be a uint8 type");
+
+   const uint32_t N = inputs.size(0);
+   const uint32_t C = inputs.size(1);
+   const uint32_t H = inputs.size(2);
+   const uint32_t W = inputs.size(3);
+
+   AT_ASSERTM(C == 1, "inputs must be [N, 1, H, W] shape");
+   AT_ASSERTM((H % 2) == 0, "height must be an even number");
+   AT_ASSERTM((W % 2) == 0, "width must be an even number");
+
+   // labels are stored as int32
+   auto label_options =
+       torch::TensorOptions().dtype(torch::kInt32).device(inputs.device());
+   torch::Tensor labels = torch::zeros({N, C, H, W}, label_options);
+   torch::Tensor counts_init = torch::zeros({N, C, H, W}, label_options);
+   torch::Tensor counts_final = torch::zeros({N, C, H, W}, label_options);
+
+   if (N == 0 || H == 0 || W == 0) {
+     // empty input masks: return empty label and count tensors
+     // returned values are [labels, counts]
+     std::vector<torch::Tensor> outputs;
+     outputs.push_back(labels);
+     outputs.push_back(counts_final);
+     return outputs;
+   }
+
+   dim3 grid = dim3(
+       ((W + 1) / 2 + BLOCK_COLS - 1) / BLOCK_COLS,
+       ((H + 1) / 2 + BLOCK_ROWS - 1) / BLOCK_ROWS,
+       N);
+   dim3 block = dim3(BLOCK_COLS, BLOCK_ROWS);
+   dim3 grid_count =
+       dim3((W + BLOCK_COLS) / BLOCK_COLS, (H + BLOCK_ROWS) / BLOCK_ROWS, N);
+   dim3 block_count = dim3(BLOCK_COLS, BLOCK_ROWS);
+   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   cc2d::init_labeling<<<grid, block, 0, stream>>>(
+       labels.data_ptr<int32_t>(), W, H);
+   cc2d::merge<<<grid, block, 0, stream>>>(
+       inputs.data_ptr<uint8_t>(), labels.data_ptr<int32_t>(), W, H);
+   cc2d::compression<<<grid, block, 0, stream>>>(
+       labels.data_ptr<int32_t>(), W, H);
+   cc2d::final_labeling<<<grid, block, 0, stream>>>(
+       inputs.data_ptr<uint8_t>(), labels.data_ptr<int32_t>(), W, H);
+
+   if (get_counts) {
+     cc2d::init_counting<<<grid_count, block_count, 0, stream>>>(
+         labels.data_ptr<int32_t>(), counts_init.data_ptr<int32_t>(), W, H);
+     cc2d::final_counting<<<grid_count, block_count, 0, stream>>>(
+         labels.data_ptr<int32_t>(),
+         counts_init.data_ptr<int32_t>(),
+         counts_final.data_ptr<int32_t>(),
+         W,
+         H);
+   }
+
+   // returned values are [labels, counts]
+   std::vector<torch::Tensor> outputs;
+   outputs.push_back(labels);
+   outputs.push_back(counts_final);
+   return outputs;
+ }
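The file above implements block-based union-find labeling on 2x2 pixel blocks. For sanity-checking its semantics on CPU, here is a hedged pure-Python reference (an interpretation of intent, not part of the upload): 8-connected components per mask, positive labels on foreground (the CUDA kernel numbers components by root pixel index + 1, so exact label values differ), and per-pixel component sizes for the counts output.

```python
# CPU reference for cc_2d semantics on a single H x W mask; a sketch,
# not the CUDA algorithm. Label ids differ from the kernel's output.
from collections import deque

def cc_2d_reference(mask):
    """mask: list of lists of 0/1. Returns (labels, counts)."""
    H, W = len(mask), len(mask[0])
    labels = [[0] * W for _ in range(H)]
    counts = [[0] * W for _ in range(H)]
    next_label = 1
    for sy in range(H):
        for sx in range(W):
            if mask[sy][sx] and not labels[sy][sx]:
                # BFS flood fill over the 8-connected neighbourhood.
                component, queue = [], deque([(sy, sx)])
                labels[sy][sx] = next_label
                while queue:
                    y, x = queue.popleft()
                    component.append((y, x))
                    for dy in (-1, 0, 1):
                        for dx in (-1, 0, 1):
                            ny, nx = y + dy, x + dx
                            if (0 <= ny < H and 0 <= nx < W
                                    and mask[ny][nx] and not labels[ny][nx]):
                                labels[ny][nx] = next_label
                                queue.append((ny, nx))
                # Every pixel of the component stores the component size.
                for y, x in component:
                    counts[y][x] = len(component)
                next_label += 1
    return labels, counts
```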
sam3_kernels/generic_nms.cu ADDED
@@ -0,0 +1,331 @@
+ #include <ATen/ATen.h>
+ #include <ATen/AccumulateType.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <torch/torch.h>
+ #include <torch/library.h>
+
+ namespace {
+
+ template <typename integer>
+ constexpr __host__ __device__ inline integer ceil_div(integer n, integer m) {
+   return (n + m - 1) / m;
+ }
+
+ int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+ template <typename T>
+ __device__ inline bool
+ devIoU(T const* const a, T const* const b, const float threshold) {
+   T left = max(a[0], b[0]), right = min(a[2], b[2]);
+   T top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+   T width = max(right - left, (T)0), height = max(bottom - top, (T)0);
+   using acc_T = at::acc_type<T, /*is_cuda=*/true>;
+   acc_T interS = (acc_T)width * height;
+   acc_T Sa = ((acc_T)a[2] - a[0]) * (a[3] - a[1]);
+   acc_T Sb = ((acc_T)b[2] - b[0]) * (b[3] - b[1]);
+   return (interS / (Sa + Sb - interS)) > threshold;
+ }
+
+ template <typename T>
+ __global__ void nms_kernel_impl(
+     int n_boxes,
+     double iou_threshold,
+     const T* dev_boxes,
+     unsigned long long* dev_mask) {
+   const int row_start = blockIdx.y;
+   const int col_start = blockIdx.x;
+
+   if (row_start > col_start)
+     return;
+
+   const int row_size =
+       min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+   const int col_size =
+       min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+   __shared__ T block_boxes[threadsPerBlock * 4];
+   if (threadIdx.x < col_size) {
+     block_boxes[threadIdx.x * 4 + 0] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
+     block_boxes[threadIdx.x * 4 + 1] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
+     block_boxes[threadIdx.x * 4 + 2] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
+     block_boxes[threadIdx.x * 4 + 3] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
+   }
+   __syncthreads();
+
+   if (threadIdx.x < row_size) {
+     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+     const T* cur_box = dev_boxes + cur_box_idx * 4;
+     int i = 0;
+     unsigned long long t = 0;
+     int start = 0;
+     if (row_start == col_start) {
+       start = threadIdx.x + 1;
+     }
+     for (i = start; i < col_size; i++) {
+       if (devIoU<T>(cur_box, block_boxes + i * 4, iou_threshold)) {
+         t |= 1ULL << i;
+       }
+     }
+     const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
+     dev_mask[cur_box_idx * col_blocks + col_start] = t;
+   }
+ }
+
+ template <typename T>
+ __global__ void nms_kernel_iou_impl(
+     int n_boxes,
+     double iou_threshold,
+     const T* dev_iou, // [N, N] row-major IoU matrix
+     unsigned long long* dev_mask) {
+   const int row_start = blockIdx.y;
+   const int col_start = blockIdx.x;
+
+   if (row_start > col_start)
+     return;
+
+   const int row_size =
+       min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+   const int col_size =
+       min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+   if (threadIdx.x < row_size) {
+     const int cur_row_idx = threadsPerBlock * row_start + threadIdx.x;
+     int i = 0;
+     unsigned long long t = 0;
+     int start = 0;
+     if (row_start == col_start) {
+       start = threadIdx.x + 1;
+     }
+     const int col_base = threadsPerBlock * col_start;
+     for (i = start; i < col_size; i++) {
+       const int col_idx = col_base + i;
+       T iou = dev_iou[cur_row_idx * n_boxes + col_idx];
+       if (static_cast<double>(iou) > iou_threshold) {
+         t |= 1ULL << i;
+       }
+     }
+     const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
+     dev_mask[cur_row_idx * col_blocks + col_start] = t;
+   }
+ }
+
+ __global__ static void gather_keep_from_mask(
+     bool* keep,
+     const unsigned long long* dev_mask,
+     const int n_boxes) {
+   // Taken and adapted from mmcv
+   // https://github.com/open-mmlab/mmcv/blob/03ce9208d18c0a63d7ffa087ea1c2f5661f2441a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh#L76
+   const int col_blocks = ceil_div(n_boxes, threadsPerBlock);
+   const int thread_id = threadIdx.x;
+
+   // Mark the bboxes which have been removed.
+   extern __shared__ unsigned long long removed[];
+
+   // Initialize removed.
+   for (int i = thread_id; i < col_blocks; i += blockDim.x) {
+     removed[i] = 0;
+   }
+   __syncthreads();
+
+   for (int nblock = 0; nblock < col_blocks; nblock++) {
+     auto removed_val = removed[nblock];
+     __syncthreads();
+     const int i_offset = nblock * threadsPerBlock;
+ #pragma unroll
+     for (int inblock = 0; inblock < threadsPerBlock; inblock++) {
+       const int i = i_offset + inblock;
+       if (i >= n_boxes)
+         break;
+       // Select a candidate and check whether it should be kept.
+       if (!(removed_val & (1ULL << inblock))) {
+         if (thread_id == 0) {
+           keep[i] = true;
+         }
+         auto p = dev_mask + i * col_blocks;
+         // Remove all bboxes which overlap the candidate.
+         for (int j = thread_id; j < col_blocks; j += blockDim.x) {
+           if (j >= nblock)
+             removed[j] |= p[j];
+         }
+         __syncthreads();
+         removed_val = removed[nblock];
+       }
+     }
+   }
+ }
+
+ // Extended op with explicit flag
+ at::Tensor nms_kernel_ex(
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     double iou_threshold,
+     bool use_iou_matrix) {
+   TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor");
+   TORCH_CHECK(scores.is_cuda(), "scores must be a CUDA tensor");
+   TORCH_CHECK(
+       dets.dim() == 2,
+       "first argument should be a 2d tensor, got ",
+       dets.dim(),
+       "D");
+   TORCH_CHECK(
+       scores.dim() == 1,
+       "scores should be a 1d tensor, got ",
+       scores.dim(),
+       "D");
+   TORCH_CHECK(
+       dets.size(0) == scores.size(0),
+       "first argument and scores should have same number of elements in dimension 0, got ",
+       dets.size(0),
+       " and ",
+       scores.size(0));
+
+   at::cuda::CUDAGuard device_guard(dets.device());
+
+   if (dets.numel() == 0) {
+     return at::empty({0}, dets.options().dtype(at::kLong));
+   }
+
+   auto order_t = std::get<1>(
+       scores.sort(/*stable=*/true, /*dim=*/0, /*descending=*/true));
+   int dets_num = dets.size(0);
+   const int col_blocks = ceil_div(dets_num, threadsPerBlock);
+
+   at::Tensor mask =
+       at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+   dim3 blocks(col_blocks, col_blocks);
+   dim3 threads(threadsPerBlock);
+   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   if (use_iou_matrix) {
+     TORCH_CHECK(
+         dets.size(0) == dets.size(1),
+         "when use_iou_matrix=True, first argument must be [N,N]");
+     auto sorted_iou =
+         dets.index_select(0, order_t).index_select(1, order_t).contiguous();
+     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+         sorted_iou.scalar_type(), "nms_kernel_iou_ex", [&] {
+           nms_kernel_iou_impl<scalar_t><<<blocks, threads, 0, stream>>>(
+               dets_num,
+               iou_threshold,
+               sorted_iou.data_ptr<scalar_t>(),
+               (unsigned long long*)mask.data_ptr<int64_t>());
+         });
+   } else {
+     TORCH_CHECK(
+         dets.size(1) == 4, "when use_iou_matrix=False, boxes must be [N,4]");
+     auto dets_sorted = dets.index_select(0, order_t).contiguous();
+     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+         dets_sorted.scalar_type(), "nms_kernel_ex", [&] {
+           nms_kernel_impl<scalar_t><<<blocks, threads, 0, stream>>>(
+               dets_num,
+               iou_threshold,
+               dets_sorted.data_ptr<scalar_t>(),
+               (unsigned long long*)mask.data_ptr<int64_t>());
+         });
+   }
+
+   at::Tensor keep =
+       at::zeros({dets_num}, dets.options().dtype(at::kBool).device(at::kCUDA));
+   gather_keep_from_mask<<<
+       1,
+       min(col_blocks, threadsPerBlock),
+       col_blocks * sizeof(unsigned long long),
+       stream>>>(
+       keep.data_ptr<bool>(),
+       (unsigned long long*)mask.data_ptr<int64_t>(),
+       dets_num);
+
+   AT_CUDA_CHECK(cudaGetLastError());
+   return order_t.masked_select(keep);
+ }
+
+ } // namespace
+
+ at::Tensor generic_nms(
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     double iou_threshold,
+     bool use_iou_matrix) {
+   TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor");
+   TORCH_CHECK(scores.is_cuda(), "scores must be a CUDA tensor");
+   TORCH_CHECK(
+       dets.dim() == 2,
+       "first argument should be a 2d tensor, got ",
+       dets.dim(),
+       "D");
+   TORCH_CHECK(
+       scores.dim() == 1,
+       "scores should be a 1d tensor, got ",
+       scores.dim(),
+       "D");
+   TORCH_CHECK(
+       dets.size(0) == scores.size(0),
+       "first argument and scores should have same number of elements in dimension 0, got ",
+       dets.size(0),
+       " and ",
+       scores.size(0));
+
+   at::cuda::CUDAGuard device_guard(dets.device());
+
+   if (dets.numel() == 0) {
+     return at::empty({0}, dets.options().dtype(at::kLong));
+   }
+
+   auto order_t = std::get<1>(
+       scores.sort(/*stable=*/true, /*dim=*/0, /*descending=*/true));
+   int dets_num = dets.size(0);
+   const int col_blocks = ceil_div(dets_num, threadsPerBlock);
+
+   at::Tensor mask =
+       at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+   dim3 blocks(col_blocks, col_blocks);
+   dim3 threads(threadsPerBlock);
+   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   if (use_iou_matrix) {
+     TORCH_CHECK(
+         dets.size(0) == dets.size(1),
+         "when use_iou_matrix=True, first argument must be [N,N]");
+     auto sorted_iou =
+         dets.index_select(0, order_t).index_select(1, order_t).contiguous();
+     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+         sorted_iou.scalar_type(), "nms_kernel_iou_ex", [&] {
+           nms_kernel_iou_impl<scalar_t><<<blocks, threads, 0, stream>>>(
+               dets_num,
+               iou_threshold,
+               sorted_iou.data_ptr<scalar_t>(),
+               (unsigned long long*)mask.data_ptr<int64_t>());
+         });
+   } else {
+     TORCH_CHECK(
+         dets.size(1) == 4, "when use_iou_matrix=False, boxes must be [N,4]");
+     auto dets_sorted = dets.index_select(0, order_t).contiguous();
+     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+         dets_sorted.scalar_type(), "nms_kernel_ex", [&] {
+           nms_kernel_impl<scalar_t><<<blocks, threads, 0, stream>>>(
+               dets_num,
+               iou_threshold,
+               dets_sorted.data_ptr<scalar_t>(),
+               (unsigned long long*)mask.data_ptr<int64_t>());
+         });
+   }
+
+   at::Tensor keep =
+       at::zeros({dets_num}, dets.options().dtype(at::kBool).device(at::kCUDA));
+   gather_keep_from_mask<<<
+       1,
+       min(col_blocks, threadsPerBlock),
+       col_blocks * sizeof(unsigned long long),
+       stream>>>(
+       keep.data_ptr<bool>(),
+       (unsigned long long*)mask.data_ptr<int64_t>(),
+       dets_num);
+
+   AT_CUDA_CHECK(cudaGetLastError());
+   return order_t.masked_select(keep);
+ }
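For the box path (`use_iou_matrix=False`), the file computes standard greedy NMS over score-sorted boxes and returns the kept indices into the original order (`order_t.masked_select(keep)`). A hedged CPU reference for validation follows; it is illustrative, not the CUDA algorithm.

```python
# Greedy NMS reference for the use_iou_matrix=False path; a sketch for
# cross-checking outputs, not the CUDA implementation.
import torch

def generic_nms_reference(dets, scores, iou_threshold):
    """dets: [N, 4] as (x1, y1, x2, y2); scores: [N]. Returns kept indices."""
    order = scores.sort(descending=True, stable=True).indices
    keep, suppressed = [], torch.zeros(len(order), dtype=torch.bool)
    for i in range(len(order)):
        if suppressed[i]:
            continue
        keep.append(order[i].item())
        a = dets[order[i]]
        for j in range(i + 1, len(order)):
            if suppressed[j]:
                continue
            b = dets[order[j]]
            # Intersection-over-union of boxes a and b.
            lt = torch.maximum(a[:2], b[:2])
            rb = torch.minimum(a[2:], b[2:])
            wh = (rb - lt).clamp(min=0)
            inter = wh[0] * wh[1]
            area_a = (a[2] - a[0]) * (a[3] - a[1])
            area_b = (b[2] - b[0]) * (b[3] - b[1])
            if inter / (area_a + area_b - inter) > iou_threshold:
                suppressed[j] = True
    return torch.tensor(keep, dtype=torch.long)
```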
torch-ext/sam3_kernels/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ from typing import List
+
+ from ._ops import ops
+
+ def cc_2d(inputs: torch.Tensor, get_counts: bool) -> List[torch.Tensor]:
+     return ops.cc_2d(inputs, get_counts)
+
+ def generic_nms(dets: torch.Tensor, scores: torch.Tensor, iou_threshold: float, use_iou_matrix: bool) -> torch.Tensor:
+     return ops.generic_nms(dets, scores, iou_threshold, use_iou_matrix)
+
+ __all__ = ["cc_2d", "generic_nms"]
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,14 @@
+ #include <torch/library.h>
+
+ #include "registration.h"
+ #include "torch_binding.h"
+
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+   ops.def("cc_2d(Tensor inputs, bool get_counts) -> Tensor[]");
+   ops.impl("cc_2d", torch::kCUDA, &connected_components_labeling_2d);
+
+   ops.def("generic_nms(Tensor dets, Tensor scores, float iou_threshold, bool use_iou_matrix) -> Tensor");
+   ops.impl("generic_nms", torch::kCUDA, &generic_nms);
+ }
+
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
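The schema strings above fix the Python-visible signatures: schema `float` arrives as a C++ `double`, and `-> Tensor[]` comes back as a list of tensors. A hedged sketch of invoking the registered ops directly through `torch.ops`, assuming the extension library has been loaded by importing the package:

```python
# Sketch: calling the registered ops directly. Importing sam3_kernels loads
# the extension .so (via `from . import _sam3_kernels_19700101000000`),
# which registers the ops under the versioned namespace.
import torch
import sam3_kernels  # noqa: F401  (triggers extension registration)

ops = torch.ops._sam3_kernels_19700101000000

masks = (torch.rand(1, 1, 32, 32, device="cuda") > 0.5).to(torch.uint8)
labels, counts = ops.cc_2d(masks, True)  # "-> Tensor[]" returns a list

xy1 = torch.rand(8, 2, device="cuda") * 10
wh = torch.rand(8, 2, device="cuda") * 5 + 1
boxes = torch.cat([xy1, xy1 + wh], dim=1)  # valid (x1, y1, x2, y2)
scores = torch.rand(8, device="cuda")
keep = ops.generic_nms(boxes, scores, 0.5, False)
```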
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,6 @@
+ #pragma once
+
+ #include <torch/torch.h>
+
+ std::vector<torch::Tensor> connected_components_labeling_2d(const torch::Tensor &inputs, bool get_counts);
+ torch::Tensor generic_nms(const torch::Tensor &dets, const torch::Tensor &scores, double iou_threshold, bool use_iou_matrix);