Text Generation
Transformers
Safetensors
llama
research
code
mathematics
reasoning
multilingual
long-context
custom_code
text-generation-inference
Instructions to use DeepXR/Helion-V2.5-Rnd with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use DeepXR/Helion-V2.5-Rnd with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use DeepXR/Helion-V2.5-Rnd with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "DeepXR/Helion-V2.5-Rnd"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeepXR/Helion-V2.5-Rnd",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
Use Docker
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
- SGLang
How to use DeepXR/Helion-V2.5-Rnd with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-V2.5-Rnd" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeepXR/Helion-V2.5-Rnd",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "DeepXR/Helion-V2.5-Rnd" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeepXR/Helion-V2.5-Rnd",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
- Docker Model Runner
How to use DeepXR/Helion-V2.5-Rnd with Docker Model Runner:
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
#!/usr/bin/env python3
"""
Helion-2.5-Rnd Python Client

Easy-to-use client for interacting with Helion inference server
"""

# Standard library
import json
from typing import Dict, Generator, List, Optional, Union

# Third-party
import requests
class HelionClient:
    """Client for the Helion-2.5-Rnd inference API.

    Sends JSON requests to ``{base_url}/v1/chat/completions``,
    ``{base_url}/v1/models`` and ``{base_url}/health`` using ``requests``.
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        api_key: Optional[str] = None,
        timeout: int = 300
    ):
        """
        Initialize Helion client

        Args:
            base_url: Base URL of the inference server (trailing slashes
                are stripped)
            api_key: Optional API key; when given it is sent as a
                ``Bearer`` token in the ``Authorization`` header
            timeout: Request timeout in seconds for completion requests
        """
        self.base_url = base_url.rstrip('/')
        self.timeout = timeout
        self.headers = {
            "Content-Type": "application/json"
        }
        if api_key:
            self.headers["Authorization"] = f"Bearer {api_key}"

    def chat(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 4096,
        stream: bool = False,
        **kwargs
    ) -> Union[str, Generator[str, None, None]]:
        """
        Send a chat completion request

        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Sampling temperature (0.0 to 2.0)
            max_tokens: Maximum tokens to generate
            stream: Whether to stream the response
            **kwargs: Additional parameters merged into the request payload
                (they are applied last, so they override the named ones)

        Returns:
            Generated text, or a generator of text chunks when ``stream``
            is true
        """
        payload = {
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "stream": stream,
            **kwargs
        }
        if stream:
            return self._stream_chat(payload)
        return self._complete_chat(payload)

    def _complete_chat(self, payload: Dict) -> str:
        """Non-streaming chat completion.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers=self.headers,
            json=payload,
            timeout=self.timeout
        )
        response.raise_for_status()
        data = response.json()
        return data["choices"][0]["message"]["content"]

    def _stream_chat(self, payload: Dict) -> Generator[str, None, None]:
        """Streaming chat completion.

        This is a generator, so the HTTP request is only issued once the
        caller starts iterating. The response is held in a ``with`` block so
        the connection is released when the stream ends, when an error is
        raised, or when the caller stops iterating early.

        Yields:
            Non-empty text fragments in the order they arrive.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            RuntimeError: If a streamed chunk carries an ``error`` payload.
        """
        with requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            timeout=self.timeout
        ) as response:
            response.raise_for_status()
            for raw_line in response.iter_lines():
                text = self._parse_stream_line(raw_line)
                if text is None:
                    # End-of-stream marker received.
                    break
                if text:
                    yield text

    @staticmethod
    def _parse_stream_line(raw_line: Union[bytes, str]) -> Optional[str]:
        """Extract the text fragment from one line of the event stream.

        Args:
            raw_line: One line of the response body, as bytes or str.

        Returns:
            ``None`` when the line is the ``[DONE]`` end-of-stream marker;
            otherwise the text carried by the chunk, or ``""`` when the line
            has nothing to emit (blank line, non-``data`` field, invalid
            JSON, or a chunk without ``choices[0].delta.content`` text).

        Raises:
            RuntimeError: If the chunk is a JSON object with an ``error`` key.
        """
        if not raw_line:
            return ""
        line = raw_line.decode('utf-8') if isinstance(raw_line, bytes) else raw_line
        # Accept both "data: {...}" and "data:{...}".
        if not line.startswith('data:'):
            return ""
        data_str = line[len('data:'):].strip()
        if data_str == '[DONE]':
            return None
        try:
            data = json.loads(data_str)
        except json.JSONDecodeError:
            # Best-effort: ignore lines that are not valid JSON.
            return ""
        if not isinstance(data, dict):
            return ""
        if "error" in data:
            # Surface the server's error instead of failing later with an
            # unrelated KeyError or silently truncating the output.
            raise RuntimeError(f"Server returned an error while streaming: {data['error']}")
        choices = data.get("choices")
        if not isinstance(choices, list) or not choices:
            return ""
        first_choice = choices[0]
        delta = first_choice.get("delta") if isinstance(first_choice, dict) else None
        if not isinstance(delta, dict):
            return ""
        content = delta.get("content")
        return content if isinstance(content, str) else ""

    def complete(
        self,
        prompt: str,
        temperature: float = 0.7,
        max_tokens: int = 4096,
        stream: bool = False,
        **kwargs
    ) -> Union[str, Generator[str, None, None]]:
        """
        Send a text completion request

        The prompt is wrapped as a single ``user`` message and sent through
        :meth:`chat`.

        Args:
            prompt: Input text prompt
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            stream: Whether to stream the response
            **kwargs: Additional parameters

        Returns:
            Generated text or generator for streaming
        """
        messages = [{"role": "user", "content": prompt}]
        return self.chat(
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=stream,
            **kwargs
        )

    def health_check(self) -> Dict:
        """Check server health.

        Returns:
            The JSON body of ``GET {base_url}/health``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(
            f"{self.base_url}/health",
            headers=self.headers,
            timeout=10
        )
        response.raise_for_status()
        return response.json()

    def list_models(self) -> List[Dict]:
        """List available models.

        Returns:
            The ``data`` entry of the JSON body of ``GET {base_url}/v1/models``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(
            f"{self.base_url}/v1/models",
            headers=self.headers,
            timeout=10
        )
        response.raise_for_status()
        return response.json()["data"]
class HelionAssistant:
    """High-level assistant interface for Helion.

    Wraps a ``HelionClient`` and keeps the running conversation so that each
    call to :meth:`chat` sends the system prompt, the prior turns and the new
    user message.
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        system_prompt: Optional[str] = None,
        **client_kwargs
    ):
        """
        Initialize Helion assistant

        Args:
            base_url: Base URL of inference server
            system_prompt: System prompt to use for all conversations; a
                built-in default is used when this is ``None`` or empty
            **client_kwargs: Additional arguments for HelionClient
        """
        self.client = HelionClient(base_url=base_url, **client_kwargs)
        self.system_prompt = system_prompt or (
            "You are Helion, an advanced AI assistant developed by DeepXR. "
            "You are helpful, harmless, and honest."
        )
        self.conversation_history: List[Dict[str, str]] = []

    def chat(
        self,
        message: str,
        temperature: float = 0.7,
        max_tokens: int = 4096,
        stream: bool = False,
        reset_history: bool = False
    ) -> Union[str, Generator[str, None, None]]:
        """
        Chat with the assistant

        Args:
            message: User message
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            stream: Whether to stream the response
            reset_history: Whether to reset conversation history before
                sending this message

        Returns:
            Assistant response text, or a generator of text chunks when
            ``stream`` is true. In streaming mode the exchange is added to
            the history only after the generator has been fully consumed.
        """
        if reset_history:
            self.conversation_history = []

        # Build messages: system prompt, prior turns, then the new message.
        messages = [{"role": "system", "content": self.system_prompt}]
        messages.extend(self.conversation_history)
        messages.append({"role": "user", "content": message})

        if stream:
            return self._stream_and_store(messages, temperature, max_tokens, message)

        response = self.client.chat(
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=False
        )
        self._record_exchange(message, response)
        return response

    def _stream_and_store(
        self,
        messages: List[Dict],
        temperature: float,
        max_tokens: int,
        user_message: str
    ) -> Generator[str, None, None]:
        """Stream response and store it in history once the stream is exhausted."""
        # Collect fragments in a list and join once at the end rather than
        # growing a string with repeated concatenation.
        pieces: List[str] = []
        for chunk in self.client.chat(
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True
        ):
            pieces.append(chunk)
            yield chunk
        # Update history after streaming complete
        self._record_exchange(user_message, "".join(pieces))

    def _record_exchange(self, user_message: str, assistant_message: str) -> None:
        """Append one user/assistant turn pair to the conversation history."""
        self.conversation_history.append({"role": "user", "content": user_message})
        self.conversation_history.append({"role": "assistant", "content": assistant_message})

    def reset(self):
        """Reset conversation history"""
        self.conversation_history = []

    def get_history(self) -> List[Dict[str, str]]:
        """Get conversation history.

        Returns:
            A new list holding a copy of every message dict, so changes made
            by the caller do not alter the assistant's stored history.
        """
        return [dict(entry) for entry in self.conversation_history]
# Example usage
def example_usage():
    """Example usage of Helion client.

    Runs a health check, a plain completion, a chat completion, a streamed
    completion and a two-turn assistant conversation, printing each result.
    Every call goes to a server at ``http://localhost:8000``.
    """
    # Initialize client
    client = HelionClient(base_url="http://localhost:8000")

    # Check health
    health = client.health_check()
    print(f"Server status: {health['status']}")

    # Simple completion
    response = client.complete(
        "Explain quantum computing in simple terms:",
        temperature=0.7,
        max_tokens=500
    )
    print(f"\nResponse: {response}")

    # Chat with conversation
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Write a Python function to calculate fibonacci numbers"}
    ]
    response = client.chat(messages=messages, temperature=0.3)
    print(f"\nCode: {response}")

    # Streaming example
    print("\nStreaming response:")
    for chunk in client.complete("Tell me a short story about AI:", stream=True):
        # Flush per chunk so the text appears as it is generated.
        print(chunk, end='', flush=True)
    print()

    # Using assistant interface
    assistant = HelionAssistant()
    response = assistant.chat("What is machine learning?")
    print(f"\nAssistant: {response}")

    # Continue conversation (the assistant sends the earlier turn as context)
    response = assistant.chat("Can you give me an example?")
    print(f"\nAssistant: {response}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    example_usage()