mknolan committed on
Commit
b1d4df2
·
verified ·
1 Parent(s): f7cf794

Add GPU diagnostic script

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ GPU Diagnostics Tool for Hugging Face Spaces
4
+ This script performs a comprehensive check of GPU availability and functionality.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import subprocess
10
+ import time
11
+ import json
12
+
13
+ print("=" * 80)
14
+ print("GPU DIAGNOSTICS TOOL")
15
+ print("=" * 80)
16
+
17
+ # Check Python version
18
+ print(f"Python version: {sys.version}")
19
+ print("-" * 80)
20
+
21
+ # Check environment variables
22
+ print("ENVIRONMENT VARIABLES:")
23
+ gpu_related_vars = [
24
+ "CUDA_VISIBLE_DEVICES",
25
+ "NVIDIA_VISIBLE_DEVICES",
26
+ "PYTORCH_CUDA_ALLOC_CONF",
27
+ "HF_HOME"
28
+ ]
29
+
30
+ for var in gpu_related_vars:
31
+ print(f"{var}: {os.environ.get(var, 'Not set')}")
32
+ print("-" * 80)
33
+
34
+ # Check for nvidia-smi
35
+ print("CHECKING FOR NVIDIA-SMI:")
36
+ try:
37
+ result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
38
+ if result.returncode == 0:
39
+ print("nvidia-smi is available and working!")
40
+ print(result.stdout)
41
+ else:
42
+ print("nvidia-smi error:")
43
+ print(result.stderr)
44
+ except Exception as e:
45
+ print(f"Error running nvidia-smi: {str(e)}")
46
+ print("-" * 80)
47
+
48
+ # Check PyTorch and CUDA
49
+ print("CHECKING PYTORCH AND CUDA:")
50
+ try:
51
+ import torch
52
+
53
+ print(f"PyTorch version: {torch.__version__}")
54
+ print(f"CUDA available: {torch.cuda.is_available()}")
55
+ print(f"CUDA version: {torch.version.cuda if torch.cuda.is_available() else 'Not available'}")
56
+
57
+ if torch.cuda.is_available():
58
+ print(f"CUDA device count: {torch.cuda.device_count()}")
59
+ for i in range(torch.cuda.device_count()):
60
+ print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
61
+ print(f"Current CUDA device: {torch.cuda.current_device()}")
62
+
63
+ # Try to create and operate on a CUDA tensor
64
+ print("\nTesting CUDA tensor creation:")
65
+ try:
66
+ start_time = time.time()
67
+ x = torch.rand(1000, 1000, device="cuda" if torch.cuda.is_available() else "cpu")
68
+ y = x @ x # Matrix multiplication to test computation
69
+ torch.cuda.synchronize() # Wait for the operation to complete
70
+ end_time = time.time()
71
+
72
+ if torch.cuda.is_available():
73
+ print(f"Successfully created and operated on a CUDA tensor in {end_time - start_time:.4f} seconds")
74
+ else:
75
+ print(f"Created and operated on a CPU tensor in {end_time - start_time:.4f} seconds (CUDA not available)")
76
+ except Exception as e:
77
+ print(f"Error in tensor creation/operation: {str(e)}")
78
+
79
+ # Try to get more detailed CUDA info
80
+ if torch.cuda.is_available():
81
+ print("\nDetailed CUDA information:")
82
+ print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
83
+ print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
84
+ print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
85
+ except ImportError:
86
+ print("PyTorch is not installed")
87
+ print("-" * 80)
88
+
89
# Optionally expose the same diagnostics through a Gradio web UI.
print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
try:
    import gradio as gr

    def check_gpu():
        """Run the GPU checks on demand and return them as pretty-printed JSON."""
        report = {
            "python_version": sys.version,
            "environment_vars": {var: os.environ.get(var, "Not set") for var in gpu_related_vars},
            "torch_available": False,
            "cuda_available": False,
        }

        try:
            import torch

            report["torch_available"] = True
            report["torch_version"] = torch.__version__
            has_cuda = torch.cuda.is_available()
            report["cuda_available"] = has_cuda

            if not has_cuda:
                report["gpu_test_passed"] = False
            else:
                report["cuda_version"] = torch.version.cuda
                report["cuda_device_count"] = torch.cuda.device_count()
                report["cuda_device_name"] = torch.cuda.get_device_name(0)

                # Smoke test: time a 1000x1000 matmul on the GPU.
                t0 = time.time()
                mat = torch.rand(1000, 1000, device="cuda")
                mat @ mat
                torch.cuda.synchronize()
                t1 = time.time()
                report["tensor_test_time"] = f"{t1 - t0:.4f} seconds"
                report["gpu_test_passed"] = True
        except Exception as e:
            report["error"] = str(e)
            report["gpu_test_passed"] = False

        return json.dumps(report, indent=2)

    demo = gr.Interface(
        fn=check_gpu,
        inputs=[],
        outputs="text",
        title="GPU Diagnostics",
        description="Click the button to run GPU diagnostics",
    )

    print("Starting Gradio web interface on port 7860...")
    demo.launch(server_name="0.0.0.0")
except ImportError:
    print("Gradio not installed, skipping web interface")
print("Raw GPU diagnostics complete.")
print("-" * 80)