Instructions to use DeepXR/Helion-V2.5-Rnd with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use DeepXR/Helion-V2.5-Rnd with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use DeepXR/Helion-V2.5-Rnd with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "DeepXR/Helion-V2.5-Rnd"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/DeepXR/Helion-V2.5-Rnd

SGLang

How to use DeepXR/Helion-V2.5-Rnd with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-V2.5-Rnd" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "DeepXR/Helion-V2.5-Rnd" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use DeepXR/Helion-V2.5-Rnd with Docker Model Runner:
```
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
```

Helion-V2.5-Rnd / inference / client.py

Trouter-Library

Create inference/client.py

ef0c6e1 verified 6 months ago

raw

history blame contribute delete

9.47 kB

	#!/usr/bin/env python3
	"""
	Helion-2.5-Rnd Python Client
	Easy-to-use client for interacting with Helion inference server
	"""

	import json
	import requests
	from typing import Dict, Generator, List, Optional, Union


	class HelionClient:
	    """Client for the Helion-2.5-Rnd inference API.

	    Talks JSON over HTTP (via ``requests``) to the ``/v1/chat/completions``,
	    ``/v1/models`` and ``/health`` endpoints found under ``base_url``.
	    """

	    def __init__(
	        self,
	        base_url: str = "http://localhost:8000",
	        api_key: Optional[str] = None,
	        timeout: int = 300
	    ):
	        """
	        Initialize Helion client

	        Args:
	            base_url: Base URL of the inference server (a trailing ``/`` is
	                stripped so endpoint paths can be appended safely)
	            api_key: Optional API key; when given it is sent as a
	                ``Bearer`` token in the ``Authorization`` header
	            timeout: Request timeout in seconds for completion requests
	        """
	        self.base_url = base_url.rstrip('/')
	        self.timeout = timeout
	        self.headers = {
	            "Content-Type": "application/json"
	        }
	        if api_key:
	            self.headers["Authorization"] = f"Bearer {api_key}"

	    def chat(
	        self,
	        messages: List[Dict[str, str]],
	        temperature: float = 0.7,
	        max_tokens: int = 4096,
	        stream: bool = False,
	        **kwargs
	    ) -> Union[str, Generator[str, None, None]]:
	        """
	        Send a chat completion request

	        Args:
	            messages: List of message dicts with 'role' and 'content'
	            temperature: Sampling temperature (0.0 to 2.0)
	            max_tokens: Maximum tokens to generate
	            stream: Whether to stream the response
	            **kwargs: Additional parameters merged into the request payload
	                (they take precedence over the named arguments)

	        Returns:
	            Generated text, or a generator of text chunks when ``stream``
	            is true. The streaming request is only sent once the generator
	            is first iterated.
	        """
	        payload = {
	            "messages": messages,
	            "temperature": temperature,
	            "max_tokens": max_tokens,
	            "stream": stream,
	            **kwargs
	        }

	        if stream:
	            return self._stream_chat(payload)
	        return self._complete_chat(payload)

	    def _complete_chat(self, payload: Dict) -> str:
	        """Non-streaming chat completion.

	        Raises:
	            requests.HTTPError: If the server answers with an error status.
	        """
	        response = requests.post(
	            f"{self.base_url}/v1/chat/completions",
	            headers=self.headers,
	            json=payload,
	            timeout=self.timeout
	        )
	        response.raise_for_status()

	        data = response.json()
	        return data["choices"][0]["message"]["content"]

	    @staticmethod
	    def _iter_sse_content(lines) -> Generator[str, None, None]:
	        """Yield the text deltas found in an iterable of SSE lines.

	        Args:
	            lines: Iterable of ``bytes`` or ``str`` lines, e.g. the result
	                of ``response.iter_lines()``

	        Yields:
	            Each non-empty ``choices[0].delta.content`` string, in order.
	            Iteration stops at the ``[DONE]`` sentinel.
	        """
	        for raw in lines:
	            if not raw:
	                # Blank separator lines between events
	                continue

	            text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
	            if not text.startswith('data:'):
	                # Comments / other SSE fields carry no completion text
	                continue

	            # The space after "data:" is optional, so strip rather than
	            # slicing a fixed prefix length.
	            data_str = text[5:].strip()
	            if data_str == '[DONE]':
	                break

	            try:
	                data = json.loads(data_str)
	            except json.JSONDecodeError:
	                # Best effort: ignore chunks that are not valid JSON
	                continue

	            choices = data.get("choices") if isinstance(data, dict) else None
	            if not choices:
	                # Chunks without choices would otherwise raise IndexError
	                continue

	            first = choices[0]
	            delta = first.get("delta") if isinstance(first, dict) else None
	            content = delta.get("content") if isinstance(delta, dict) else None
	            if content:
	                yield content

	    def _stream_chat(self, payload: Dict) -> Generator[str, None, None]:
	        """Streaming chat completion.

	        The HTTP response is held in a ``with`` block so the connection is
	        released when the stream ends, fails, or the caller stops iterating
	        early.

	        Raises:
	            requests.HTTPError: If the server answers with an error status.
	        """
	        with requests.post(
	            f"{self.base_url}/v1/chat/completions",
	            headers=self.headers,
	            json=payload,
	            stream=True,
	            timeout=self.timeout
	        ) as response:
	            response.raise_for_status()
	            yield from self._iter_sse_content(response.iter_lines())

	    def complete(
	        self,
	        prompt: str,
	        temperature: float = 0.7,
	        max_tokens: int = 4096,
	        stream: bool = False,
	        **kwargs
	    ) -> Union[str, Generator[str, None, None]]:
	        """
	        Send a text completion request

	        The prompt is wrapped in a single ``user`` message and sent through
	        :meth:`chat`.

	        Args:
	            prompt: Input text prompt
	            temperature: Sampling temperature
	            max_tokens: Maximum tokens to generate
	            stream: Whether to stream the response
	            **kwargs: Additional parameters

	        Returns:
	            Generated text or generator for streaming
	        """
	        messages = [{"role": "user", "content": prompt}]
	        return self.chat(
	            messages=messages,
	            temperature=temperature,
	            max_tokens=max_tokens,
	            stream=stream,
	            **kwargs
	        )

	    def health_check(self) -> Dict:
	        """Check server health.

	        Returns:
	            The decoded JSON body of ``GET /health``.

	        Raises:
	            requests.HTTPError: If the server answers with an error status.
	        """
	        response = requests.get(
	            f"{self.base_url}/health",
	            headers=self.headers,
	            timeout=10
	        )
	        response.raise_for_status()
	        return response.json()

	    def list_models(self) -> List[Dict]:
	        """List available models.

	        Returns:
	            The ``data`` list from the ``GET /v1/models`` response.

	        Raises:
	            requests.HTTPError: If the server answers with an error status.
	        """
	        response = requests.get(
	            f"{self.base_url}/v1/models",
	            headers=self.headers,
	            timeout=10
	        )
	        response.raise_for_status()
	        return response.json()["data"]


	class HelionAssistant:
	    """High-level assistant interface for Helion.

	    Wraps a :class:`HelionClient`, prepends a system prompt to every request
	    and keeps the running user/assistant conversation history.
	    """

	    def __init__(
	        self,
	        base_url: str = "http://localhost:8000",
	        system_prompt: Optional[str] = None,
	        **client_kwargs
	    ):
	        """
	        Initialize Helion assistant

	        Args:
	            base_url: Base URL of inference server
	            system_prompt: System prompt to use for all conversations; a
	                built-in default is used when this is ``None`` or empty
	            **client_kwargs: Additional arguments for HelionClient
	        """
	        self.client = HelionClient(base_url=base_url, **client_kwargs)
	        self.system_prompt = system_prompt or (
	            "You are Helion, an advanced AI assistant developed by DeepXR. "
	            "You are helpful, harmless, and honest."
	        )
	        self.conversation_history: List[Dict[str, str]] = []

	    def chat(
	        self,
	        message: str,
	        temperature: float = 0.7,
	        max_tokens: int = 4096,
	        stream: bool = False,
	        reset_history: bool = False
	    ) -> Union[str, Generator[str, None, None]]:
	        """
	        Chat with the assistant

	        Args:
	            message: User message
	            temperature: Sampling temperature
	            max_tokens: Maximum tokens to generate
	            stream: Whether to stream the response
	            reset_history: Whether to reset conversation history

	        Returns:
	            Assistant response, or a generator of response chunks when
	            ``stream`` is true. In streaming mode the exchange is added to
	            the history only after the generator has been fully consumed.
	        """
	        if reset_history:
	            self.conversation_history = []

	        # Build messages: system prompt, prior turns, then the new user turn
	        messages = [{"role": "system", "content": self.system_prompt}]
	        messages.extend(self.conversation_history)
	        messages.append({"role": "user", "content": message})

	        # Get response
	        if stream:
	            return self._stream_and_store(messages, temperature, max_tokens, message)

	        response = self.client.chat(
	            messages=messages,
	            temperature=temperature,
	            max_tokens=max_tokens,
	            stream=False
	        )

	        # Update history
	        self.conversation_history.append({"role": "user", "content": message})
	        self.conversation_history.append({"role": "assistant", "content": response})

	        return response

	    def _stream_and_store(
	        self,
	        messages: List[Dict],
	        temperature: float,
	        max_tokens: int,
	        user_message: str
	    ) -> Generator[str, None, None]:
	        """Stream response and store in history.

	        Chunks are yielded as they arrive; the joined text is recorded in
	        the history once the underlying stream is exhausted.
	        """
	        # Collect chunks in a list and join once instead of repeated +=
	        pieces: List[str] = []

	        for chunk in self.client.chat(
	            messages=messages,
	            temperature=temperature,
	            max_tokens=max_tokens,
	            stream=True
	        ):
	            pieces.append(chunk)
	            yield chunk

	        # Update history after streaming complete
	        self.conversation_history.append({"role": "user", "content": user_message})
	        self.conversation_history.append(
	            {"role": "assistant", "content": "".join(pieces)}
	        )

	    def reset(self):
	        """Reset conversation history"""
	        self.conversation_history = []

	    def get_history(self) -> List[Dict[str, str]]:
	        """Get conversation history.

	        Returns:
	            A new list holding a copy of every message dict, so callers can
	            modify the result without altering the assistant's own history.
	        """
	        return [dict(entry) for entry in self.conversation_history]


	# Example usage
	def example_usage():
	    """Demonstrate the client and assistant APIs against a local server."""

	    # Low-level client pointed at the local endpoint
	    helion = HelionClient(base_url="http://localhost:8000")

	    # Confirm the server is reachable before sending any prompts
	    status_info = helion.health_check()
	    print(f"Server status: {status_info['status']}")

	    # One-shot completion from a plain prompt
	    explanation = helion.complete(
	        "Explain quantum computing in simple terms:",
	        temperature=0.7,
	        max_tokens=500,
	    )
	    print(f"\nResponse: {explanation}")

	    # Chat request with an explicit system + user message list
	    coding_dialogue = [
	        {"role": "system", "content": "You are a helpful coding assistant."},
	        {"role": "user", "content": "Write a Python function to calculate fibonacci numbers"},
	    ]
	    code_reply = helion.chat(messages=coding_dialogue, temperature=0.3)
	    print(f"\nCode: {code_reply}")

	    # Streamed completion, printed piece by piece as it arrives
	    print("\nStreaming response:")
	    for piece in helion.complete("Tell me a short story about AI:", stream=True):
	        print(piece, end='', flush=True)
	    print()

	    # Assistant interface: the second question continues the first exchange
	    bot = HelionAssistant()
	    for question in ("What is machine learning?", "Can you give me an example?"):
	        answer = bot.chat(question)
	        print(f"\nAssistant: {answer}")


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    example_usage()