import os
import re  # Regex for parsing/validating conversation turns
from typing import Optional

from openai import OpenAI

# Read the key from the OPENROUTER_API_KEY environment variable; never
# hard-code secrets in source.
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
    raise ValueError("OPENROUTER_API_KEY environment variable not set.")

# Point the OpenAI client to the OpenRouter API
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)
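
# Optional connectivity check (a sketch, not part of the original script): the
# OpenAI v1 client exposes `models.list()`, which OpenRouter also serves, so
# this can confirm the key and base_url are wired up before generating.
def check_connectivity() -> bool:
    """Return True if the OpenRouter API is reachable with the current key."""
    try:
        client.models.list()
        return True
    except Exception as exc:
        print(f"Connectivity check failed: {exc}")
        return False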

# --- Core Generation Functions ---

def generate_synthetic_text(
    prompt: str,
    model: str = "deepseek/deepseek-chat-v3-0324:free",
    system_message: str = "You are a helpful assistant generating synthetic data.",
    temperature: Optional[float] = 0.7,  # Default temperature
    top_p: Optional[float] = None,  # Default top_p (let the API decide if None)
    max_tokens: Optional[int] = None,  # Default max_tokens (let the API decide if None)
) -> str:
    """
    Generates synthetic text using an OpenRouter model via Chat Completions,
    including model parameter controls.

    Args:
        prompt: The user's input prompt.
        model: The model ID.
        system_message: The system message context.
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        The generated text string or an error message.
    """
    if not api_key:
        return "Error: OPENROUTER_API_KEY not configured properly. Please set the environment variable."

    # Prepare parameters, only including the optional ones when they are not None
    params = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        "extra_headers": {
            # "HTTP-Referer": "YOUR_SITE_URL",
            "X-Title": "SynthGen",
        },
    }
    if temperature is not None:
        params["temperature"] = temperature
    if top_p is not None:
        params["top_p"] = top_p
    if max_tokens is not None:
        params["max_tokens"] = max_tokens

    try:
        response = client.chat.completions.create(**params)  # Dictionary unpacking
        if response.choices and response.choices[0].message and response.choices[0].message.content:
            return response.choices[0].message.content.strip()
        else:
            print(f"Warning: No content in response for model {model}. Response: {response}")
            return "Error: No content generated by the model."
    except Exception as e:
        print(f"Error during API call to model {model}: {e}")
        return f"Error during API call: {e}"

def generate_prompts(
    num_prompts: int,
    model: str,
    topic_hint: str = "diverse and interesting",
    temperature: Optional[float] = 0.7,  # Passed through to the core call
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 200,  # A reasonable default cap for prompt lists
) -> list[str]:
    """
    Generates a list of conversation prompts using an AI model.

    Args:
        num_prompts: The number of prompts to generate.
        model: The model ID to use for generation.
        topic_hint: Optional hint for the kind of topics (e.g., "related to technology").
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A list of generated prompts.
    """
    instruction = (
        f"Generate exactly {num_prompts} unique, {topic_hint} system prompts or starting topics suitable "
        f"for generating synthetic conversations between a user and an AI assistant. "
        f"Each prompt should be concise (ideally one sentence) and focus on a clear task or subject. "
        f"Present each prompt on a new line, with no other introductory or concluding text."
        f"\n\nExamples:\n"
        f"- Act as a travel agent planning a trip to Japan.\n"
        f"- Explain the concept of black holes to a 5-year-old.\n"
        f"- Write a python function to reverse a string."
    )
    system_msg = "You are an expert prompt generator. Follow the user's instructions precisely."

    # Pass the settings down to generate_synthetic_text
    generated_text = generate_synthetic_text(
        instruction,
        model,
        system_message=system_msg,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if generated_text.startswith("Error"):
        raise ValueError(generated_text)

    # Split into lines, drop empties, and strip any leading "- " bullet markers
    prompts = [p.strip() for p in generated_text.strip().split("\n") if p.strip()]
    prompts = [re.sub(r"^-\s*", "", p) for p in prompts]
    if not prompts:
        # Log the raw generated text if parsing failed
        print(f"Warning: Failed to parse prompts from generated text. Raw text:\n{generated_text}")
        raise ValueError("AI failed to generate prompts in the expected format.")

    # Truncate in case the model generated more than the requested number
    return prompts[:num_prompts]

def generate_synthetic_conversation(
    system_prompt: str,
    model: str,
    num_turns: int,
    temperature: Optional[float] = 0.7,  # Passed through to the core call
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 1000,  # A reasonable default cap for conversations
) -> str:
    """
    Generates a synthetic conversation with a specified number of turns.

    Args:
        system_prompt: The initial system prompt defining the context or AI persona.
        model: The model ID to use for generation.
        num_turns: The desired number of conversational turns (1 turn = 1 User + 1 Assistant).
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A string containing the formatted conversation.
    """
    # Ask the model to generate the whole conversation in one go for simplicity;
    # more complex approaches could involve iterative calls.
    instruction = (
        f"Generate a realistic conversation between a 'User' and an 'Assistant'. "
        f"The conversation should start based on the following system prompt/topic: '{system_prompt}'.\n"
        f"The conversation should have approximately {num_turns} pairs of User/Assistant turns.\n"
        f"Format the output clearly, starting each line with 'User:' or 'Assistant:'.\n\n"
        f"Example Format:\n"
        f"User: Hello!\n"
        f"Assistant: Hi there! How can I help you today?\n"
        f"User: Can you explain photosynthesis?\n"
        f"Assistant: Certainly! Photosynthesis is the process..."
    )
    # Use the user-provided system prompt for the *conversation's* context,
    # but a generic one for the generation *task* itself.
    system_msg_for_generation = (
        f"You are an AI assistant simulating a conversation. "
        f"The context for the conversation you generate is: {system_prompt}"
    )

    # Pass the settings down to generate_synthetic_text
    conversation_text = generate_synthetic_text(
        prompt=instruction,
        model=model,
        system_message=system_msg_for_generation,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if conversation_text.startswith("Error"):
        # Propagate the error message
        return f"Error generating conversation for prompt '{system_prompt}':\n{conversation_text}"

    # Basic validation: warn if the expected 'User:'/'Assistant:' markers are missing
    if not re.search(r"User:|Assistant:", conversation_text, re.IGNORECASE):
        print(
            f"Warning: Generated text for conversation '{system_prompt}' might not be "
            f"in the expected format. Raw text:\n{conversation_text}"
        )
        # Return the raw text anyway; the model's format may just differ slightly
        return f"Generated conversation for prompt '{system_prompt}':\n(Format might vary)\n\n{conversation_text}"

    return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"

# Function to generate different types of content based on a topic
def generate_corpus_content(
    topic: str,
    content_type: str,  # e.g., "Corpus Snippets", "Short Story", "Article"
    length_param: int,  # Meaning depends on type (e.g., number of snippets, approx. words)
    model: str,
    system_message_base: str = "You are a helpful assistant generating synthetic content.",
    temperature: Optional[float] = 0.7,
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = None,  # Estimated per content type if None
) -> str:
    """
    Generates different types of synthetic content based on a topic.

    Args:
        topic: The central topic for the content.
        content_type: The type of content to generate.
        length_param: A parameter controlling length/quantity (meaning depends on type).
        model: The model ID.
        system_message_base: Base system message (will be specialized).
        temperature: Model temperature.
        top_p: Model top_p.
        max_tokens: Model max_tokens.

    Returns:
        The generated content string or an error message.
    """
    prompt = ""
    system_message = system_message_base  # Start with the base message

    # --- Construct the prompt based on content type ---
    if content_type == "Corpus Snippets":
        if length_param <= 0:
            length_param = 5  # Default number of snippets
        prompt = (
            f"Generate exactly {length_param} distinct text snippets related to the topic: '{topic}'. "
            f"Each snippet should be a few sentences long and focus on a different aspect if possible. "
            f"Present each snippet clearly, perhaps separated by a blank line or a marker like '---'."
        )
        system_message = "You are an AI generating diverse text snippets for a data corpus."
        # Estimate max_tokens from the number of snippets if not set
        if max_tokens is None:
            max_tokens = length_param * 150
    elif content_type == "Short Story":
        if length_param <= 0:
            length_param = 300  # Default approximate word count
        prompt = (
            f"Write a short story (approximately {length_param} words) centered around the topic: '{topic}'. "
            f"The story should have a clear beginning, middle, and end."
        )
        system_message = "You are a creative AI writing a short story."
        if max_tokens is None:
            max_tokens = int(length_param * 2.5)  # Rough words-to-tokens estimate
    elif content_type == "Article":
        if length_param <= 0:
            length_param = 500  # Default approximate word count
        prompt = (
            f"Write an informative article (approximately {length_param} words) about the topic: '{topic}'. "
            f"The article should be well-structured, factual (to the best of your ability), and engaging."
        )
        system_message = "You are an AI assistant writing an informative article."
        if max_tokens is None:
            max_tokens = int(length_param * 2.5)  # Rough words-to-tokens estimate
    else:
        return f"Error: Unknown content type '{content_type}'."

    if not prompt:
        return "Error: Could not construct a valid prompt."

    # --- Call the core generation function ---
    generated_text = generate_synthetic_text(
        prompt=prompt,
        model=model,
        system_message=system_message,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )

    # Add a title for clarity; propagate errors from generate_synthetic_text as-is
    if not generated_text.startswith("Error"):
        return f"Generated {content_type} for topic '{topic}':\n\n{generated_text}"
    else:
        return generated_text

# --- Main Execution (Example Usage) ---
if __name__ == "__main__":
    print("--- Testing Basic Text Generation ---")
    test_prompt = "Describe the benefits of using synthetic data."
    text_result = generate_synthetic_text(test_prompt, temperature=0.5, max_tokens=100)  # Example with settings
    print(f"Prompt: {test_prompt}\nResult:\n{text_result}\n")

    print("\n--- Testing Prompt Generation ---")
    try:
        num_prompts_to_gen = 3
        prompts_result = generate_prompts(num_prompts_to_gen, "deepseek/deepseek-chat-v3-0324:free")
        print(f"Generated {len(prompts_result)} prompts:")
        for i, p in enumerate(prompts_result):
            print(f"{i+1}. {p}")
    except ValueError as e:
        print(f"Error generating prompts: {e}")

    print("\n--- Testing Conversation Generation ---")
    conv_prompt = "Act as a helpful expert explaining the difference between nuclear fission and fusion."
    num_conv_turns = 3
    conv_result = generate_synthetic_conversation(conv_prompt, "deepseek/deepseek-chat-v3-0324:free", num_conv_turns)
    print(f"{conv_result}\n")
| print("\n--- Testing with Invalid API Key (if applicable) ---") | |
| # Temporarily use an invalid key for testing error handling | |
| original_key = client.api_key | |
| client.api_key = "invalid-key" | |
| error_text_result = generate_synthetic_text("Test prompt") | |
| print(f"Result with invalid key: {error_text_result}") | |
| client.api_key = original_key # Restore original key | |
| print("\nGeneration tests complete.") | |