mknolan committed on
Commit
b1d4df2
·
verified ·
1 Parent(s): f7cf794

Add GPU diagnostic script

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ GPU Diagnostics Tool for Hugging Face Spaces
4
+ This script performs a comprehensive check of GPU availability and functionality.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import subprocess
10
+ import time
11
+ import json
12
+
13
+ print("=" * 80)
14
+ print("GPU DIAGNOSTICS TOOL")
15
+ print("=" * 80)
16
+
17
+ # Check Python version
18
+ print(f"Python version: {sys.version}")
19
+ print("-" * 80)
20
+
21
+ # Check environment variables
22
+ print("ENVIRONMENT VARIABLES:")
23
+ gpu_related_vars = [
24
+ "CUDA_VISIBLE_DEVICES",
25
+ "NVIDIA_VISIBLE_DEVICES",
26
+ "PYTORCH_CUDA_ALLOC_CONF",
27
+ "HF_HOME"
28
+ ]
29
+
30
+ for var in gpu_related_vars:
31
+ print(f"{var}: {os.environ.get(var, 'Not set')}")
32
+ print("-" * 80)
33
+
34
+ # Check for nvidia-smi
35
+ print("CHECKING FOR NVIDIA-SMI:")
36
+ try:
37
+ result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
38
+ if result.returncode == 0:
39
+ print("nvidia-smi is available and working!")
40
+ print(result.stdout)
41
+ else:
42
+ print("nvidia-smi error:")
43
+ print(result.stderr)
44
+ except Exception as e:
45
+ print(f"Error running nvidia-smi: {str(e)}")
46
+ print("-" * 80)
47
+
48
+ # Check PyTorch and CUDA
49
+ print("CHECKING PYTORCH AND CUDA:")
50
+ try:
51
+ import torch
52
+
53
+ print(f"PyTorch version: {torch.__version__}")
54
+ print(f"CUDA available: {torch.cuda.is_available()}")
55
+ print(f"CUDA version: {torch.version.cuda if torch.cuda.is_available() else 'Not available'}")
56
+
57
+ if torch.cuda.is_available():
58
+ print(f"CUDA device count: {torch.cuda.device_count()}")
59
+ for i in range(torch.cuda.device_count()):
60
+ print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
61
+ print(f"Current CUDA device: {torch.cuda.current_device()}")
62
+
63
+ # Try to create and operate on a CUDA tensor
64
+ print("\nTesting CUDA tensor creation:")
65
+ try:
66
+ start_time = time.time()
67
+ x = torch.rand(1000, 1000, device="cuda" if torch.cuda.is_available() else "cpu")
68
+ y = x @ x # Matrix multiplication to test computation
69
+ torch.cuda.synchronize() # Wait for the operation to complete
70
+ end_time = time.time()
71
+
72
+ if torch.cuda.is_available():
73
+ print(f"Successfully created and operated on a CUDA tensor in {end_time - start_time:.4f} seconds")
74
+ else:
75
+ print(f"Created and operated on a CPU tensor in {end_time - start_time:.4f} seconds (CUDA not available)")
76
+ except Exception as e:
77
+ print(f"Error in tensor creation/operation: {str(e)}")
78
+
79
+ # Try to get more detailed CUDA info
80
+ if torch.cuda.is_available():
81
+ print("\nDetailed CUDA information:")
82
+ print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
83
+ print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
84
+ print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
85
+ except ImportError:
86
+ print("PyTorch is not installed")
87
+ print("-" * 80)
88
+
89
# Optionally expose the same diagnostics through a Gradio web UI.
print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
try:
    import gradio as gr

    def check_gpu():
        """Run the GPU checks on demand and return them as pretty-printed JSON."""
        report = {
            "python_version": sys.version,
            "environment_vars": {var: os.environ.get(var, "Not set") for var in gpu_related_vars},
            "torch_available": False,
            "cuda_available": False,
        }

        try:
            import torch

            report["torch_available"] = True
            report["torch_version"] = torch.__version__
            has_cuda = torch.cuda.is_available()
            report["cuda_available"] = has_cuda

            if not has_cuda:
                report["gpu_test_passed"] = False
            else:
                report["cuda_version"] = torch.version.cuda
                report["cuda_device_count"] = torch.cuda.device_count()
                report["cuda_device_name"] = torch.cuda.get_device_name(0)

                # Smoke test: time a 1000x1000 matmul on the GPU.
                t0 = time.time()
                mat = torch.rand(1000, 1000, device="cuda")
                mat @ mat
                torch.cuda.synchronize()
                t1 = time.time()
                report["tensor_test_time"] = f"{t1 - t0:.4f} seconds"
                report["gpu_test_passed"] = True
        except Exception as e:
            report["error"] = str(e)
            report["gpu_test_passed"] = False

        return json.dumps(report, indent=2)

    demo = gr.Interface(
        fn=check_gpu,
        inputs=[],
        outputs="text",
        title="GPU Diagnostics",
        description="Click the button to run GPU diagnostics",
    )

    print("Starting Gradio web interface on port 7860...")
    demo.launch(server_name="0.0.0.0")
except ImportError:
    print("Gradio not installed, skipping web interface")
print("Raw GPU diagnostics complete.")
print("-" * 80)