{ "title": "Edge Inference Benchmarks", "subtitle": "Compare throughput and latency across devices and model variants.", "model_column": "model", "model_family_column": "model_family", "model_link_prefix": "https://huggingface.co/", "optimized_org": "embedl", "filters": [ { "column": "type", "label": "MODALITY" }, { "column": "batch", "label": "BATCH SIZE", "type": "number" }, { "column": "device", "label": "DEVICE", "value_labels": { "orin_nano": "Jetson Orin Nano Super", "orin_nano_super": "Jetson Orin Nano Super", "agx_orin": "Jetson AGX Orin", "agx_thor": "Jetson AGX Thor" } } ], "metrics": [ { "column": "tps", "label": "Tokens / sec", "short": "TPS ↑", "higher_is_better": true, "description": "Tokens per second (higher is better). Number of output tokens generated per second during the decoding phase. " }, { "column": "tpot", "label": "Time per Output Token (ms)", "short": "TPOT(ms) ↓", "higher_is_better": false, "description": "Time per output token in ms (lower is better). Average time (in milliseconds) required to generate one output token during decoding. Computed as TPOT = (last_token_ts - first_token_ts) / total_output_tokens." }, { "column": "ttft", "label": "Time to First Token (ms)", "short": "TTFT(ms) ↓", "higher_is_better": false, "description": "Time to first token in ms (lower is better). Time from request submission to generation of the first output token. This includes vision encoding, prompt prefill, KV cache initialization." }, { "column": "e2e", "label": "End-to-End Latency (sec)", "short": "E2E(s) ↓", "higher_is_better": false, "description": "End-to-end latency in seconds (lower is better). Total time from request submission to completion of the full generated response. This reflects real user-perceived latency." 
} ], "display_columns": [ { "column": "res", "label": "RESOLUTION", "visible_when": { "type": [ "video", "image" ] } }, { "column": "fps", "label": "FPS", "type": "number", "visible_when": { "type": [ "video" ] } }, { "column": "frames", "label": "Frames", "type": "number", "visible_when": { "type": [ "video" ] } } ],
"chart": { "default_metric": "tps", "group_by": "device", "scenarios": [ { "label": "Text", "match": { "type": "text" } }, { "label": "Image · 1280×720", "match": { "type": "image", "res": "1280x720" } }, { "label": "Video · 1280×720 · 4 FPS", "match": { "type": "video", "res": "1280x720", "fps": 4 } } ] },
"table_sort": [ { "column": "res", "direction": "asc" }, { "column": "fps", "direction": "desc" } ], "table_group_by": "model",
"model_families": { "Cosmos-Reason2-2B": { "data_file": "data/Cosmos-Reason2.csv", "table_group_by": [ "res", "fps" ], "experiment_setup": { "agx_thor": "Measurement setup: NVIDIA vLLM 26.01, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.", "agx_orin": "Measurement setup: NVIDIA AI IoT vLLM 0.14.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.", "orin_nano": "Measurement setup: NVIDIA AI IoT vLLM 0.14.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs." }, "default_device": "orin_nano", "accuracy_file": "data/acc-Cosmos-Reason2.csv", "accuracy_title": "Physical AI Bench (PAI-Bench)", "accuracy_url": "https://huggingface.co/spaces/shi-labs/physical-ai-bench-leaderboard", "demo": { "title": "Output Examples", "benchmark_match": { "type": "video", "batch": "1", "res": "1280x720", "fps": "4" }, "clips": [
{ "video": "https://nvidia-cosmos.github.io/cosmos-cookbook/gallery/vs_assets/clip_1_short.mp4", "label": "Automotive Description", "prompt": "Describe this video in detail.", "inference_setup": "1280x720 · 12 Frames", "citation": "Video source: Wagwani, R., Matveiev, N., Bapst, J., & Gu, J. (2025). Autonomous Vehicle Domain Adaptation Gallery. In NVIDIA Cosmos Cookbook.", "citation_url": "https://nvidia-cosmos.github.io/cosmos-cookbook/gallery/av_inference.html", "outputs": [ { "model": "nvidia/Cosmos-Reason2-2B", "text": "The video captures a scenic highway winding through a rural landscape, with multiple lanes of traffic flowing smoothly. Vehicles of various colors and models are seen traveling in both directions along the road. On the left side of the highway, a white pickup truck is followed by a red sedan, while on the right side, a white sedan is seen driving away from the camera. Further down the road, additional cars can be observed, including a blue hatchback and a red SUV, all maintaining a steady pace. The highway curves gently to the right, bordered by patches of dry grass and sparse trees, suggesting it might be late autumn or early spring. The sky above is clear and bright, indicating favorable weather conditions. On the right side of the highway, there is a pedestrian crossing marked by a yellow and black striped barrier, ensuring safety for walkers. In the distance, a lone figure can be seen walking along the sidewalk parallel to the road, adding a sense of scale and tranquility to the scene. The overall atmosphere is calm and orderly, with no signs of congestion or accidents." }, { "model": "embedl/Cosmos-Reason2-2B-W4A16-Edge2-FlashHead", "text": "The video captures a dynamic scene of a multi-laned highway winding through a wooded area. The road is divided into two lanes in each direction, separated by a median strip. Several vehicles are traveling along the highway, including a white sedan in the rightmost lane, a red hatchback in the leftmost lane, and a white pickup truck further ahead in the same lane as the hatchback. Additional cars can be seen further down the road, moving in both directions. The surrounding landscape features bare trees and patches of grass, indicative of a winter or early spring season. The road markings are clearly visible, with dashed white lines marking the edges of the lanes and solid yellow lines delineating the lanes. On the right side of the road, there is a warning sign with black and yellow stripes, indicating caution for drivers. The sky is clear, suggesting favorable weather conditions for driving." } ] },
{ "video": "https://nvidia-cosmos.github.io/cosmos-cookbook/gallery/assets/kitchen_oranges_input.mp4", "label": "Instruction Understanding", "prompt": "The robot in the video was given the instruction - store all fruit in the big bowl. Has it successfully completed the task specified in the instruction? Explain your answer in detail.", "inference_setup": "1280x720 · 32 Frames", "citation": "Video source: Wagwani, R., Matveiev, N., Bapst, J., & Gu, J. (2025). Autonomous Vehicle Domain Adaptation Gallery. In NVIDIA Cosmos Cookbook.", "citation_url": "https://nvidia-cosmos.github.io/cosmos-cookbook/gallery/av_inference.html", "outputs": [ { "model": "nvidia/Cosmos-Reason2-2B", "text": "No, the robot has not successfully completed the task. The instruction was to store all fruit in the big bowl, but the robot only moved one fruit into the bowl. There are still other fruits on the countertop that have not been moved into the bowl. Additionally, the robot's actions appear somewhat clumsy and inefficient." }, { "model": "embedl/Cosmos-Reason2-2B-W4A16-Edge2-FlashHead", "text": "No, the robot has not successfully completed the task. The instruction was to store all fruit in the big bowl, but the video shows that there are still other fruits on the plate and counter. Additionally, the robot's actions involve moving objects around rather than storing them in the bowl." } ] },
{ "video": "https://huggingface.co/datasets/embedl/documentation-images/resolve/main/Edge-Inference-Benchmarks/aerial_drone_view.mp4", "label": "Search and Rescue", "prompt": "Mission: Inspect the drone video for any people or animals that clearly need rescue. Answer format: 1. Rescue needed: Y/N; 2. Description: Describe the scene in detail, explain why rescue is or is not needed.", "inference_setup": "1280x720 · 24 Frames", "outputs": [ { "model": "nvidia/Cosmos-Reason2-2B", "text": "1. Rescue needed: N 2. The scene shows a group of people gathered around a table on a grassy field near a road, with a vehicle parked nearby. Individuals appear to be engaged in an activity, possibly a meeting or briefing, with some standing and others seated. There are no visible signs of distress, emergencies, or animals requiring rescue. The environment is calm, and the group seems to be cooperating peacefully." }, { "model": "embedl/Cosmos-Reason2-2B-W4A16-Edge2-FlashHead", "text": "1. Rescue needed: N; 2. The video shows an aerial view of a serene landscape with a lake, green fields, and a road with a few vehicles and people. There are no visible emergencies, accidents, or animals in distress. The scene appears peaceful and undisturbed." } ] } ] } },
"Qwen3.5": { "data_file": "data/Qwen3.5.csv", "table_group_by": [ "res", "fps" ], "experiment_setup": { "agx_thor": "Measurement setup: NVIDIA AI IoT vLLM 0.16.0 arm64, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.", "agx_orin": "Measurement setup: NVIDIA AI IoT vLLM 0.16.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.", "orin_nano": "Measurement setup: NVIDIA AI IoT vLLM 0.16.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs." }, "default_device": "agx_orin" } },
"accuracy_title": "Accuracy" }