Spaces:
Running
on
Zero
Running
on
Zero
max_model_len 8192 -> 4096
Browse files
acestep/llm_inference.py
CHANGED
|
@@ -375,9 +375,9 @@ class LLMHandler:
|
|
| 375 |
max_ratio=0.9
|
| 376 |
)
|
| 377 |
if low_gpu_memory_mode:
|
| 378 |
-
self.max_model_len =
|
| 379 |
else:
|
| 380 |
-
self.max_model_len = 8192
|
| 381 |
|
| 382 |
logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
|
| 383 |
start_time = time.time()
|
|
|
|
| 375 |
max_ratio=0.9
|
| 376 |
)
|
| 377 |
if low_gpu_memory_mode:
|
| 378 |
+
self.max_model_len = 2048
|
| 379 |
else:
|
| 380 |
+
self.max_model_len = 4096
|
| 381 |
|
| 382 |
logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
|
| 383 |
start_time = time.time()
|
acestep/third_parts/nano-vllm/nanovllm/config.py
CHANGED
|
@@ -8,7 +8,7 @@ class Config:
|
|
| 8 |
model: str
|
| 9 |
max_num_batched_tokens: int = 16384
|
| 10 |
max_num_seqs: int = 512
|
| 11 |
-
max_model_len: int = 8192
|
| 12 |
gpu_memory_utilization: float = 0.9
|
| 13 |
tensor_parallel_size: int = 1
|
| 14 |
enforce_eager: bool = False
|
|
|
|
| 8 |
model: str
|
| 9 |
max_num_batched_tokens: int = 16384
|
| 10 |
max_num_seqs: int = 512
|
| 11 |
+
max_model_len: int = 4096
|
| 12 |
gpu_memory_utilization: float = 0.9
|
| 13 |
tensor_parallel_size: int = 1
|
| 14 |
enforce_eager: bool = False
|