ChuxiJ committed on
Commit
12bc51a
·
1 Parent(s): 7534053

max_model_len 8192 -> 4096

Browse files
acestep/llm_inference.py CHANGED
@@ -375,9 +375,9 @@ class LLMHandler:
375
  max_ratio=0.9
376
  )
377
  if low_gpu_memory_mode:
378
- self.max_model_len = 4096
379
  else:
380
- self.max_model_len = 8192
381
 
382
  logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
383
  start_time = time.time()
 
375
  max_ratio=0.9
376
  )
377
  if low_gpu_memory_mode:
378
+ self.max_model_len = 2048
379
  else:
380
+ self.max_model_len = 4096
381
 
382
  logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
383
  start_time = time.time()
acestep/third_parts/nano-vllm/nanovllm/config.py CHANGED
@@ -8,7 +8,7 @@ class Config:
8
  model: str
9
  max_num_batched_tokens: int = 16384
10
  max_num_seqs: int = 512
11
- max_model_len: int = 8192
12
  gpu_memory_utilization: float = 0.9
13
  tensor_parallel_size: int = 1
14
  enforce_eager: bool = False
 
8
  model: str
9
  max_num_batched_tokens: int = 16384
10
  max_num_seqs: int = 512
11
+ max_model_len: int = 4096
12
  gpu_memory_utilization: float = 0.9
13
  tensor_parallel_size: int = 1
14
  enforce_eager: bool = False