import os
import re  # Regex for parsing/validating conversation turns
from typing import Optional

from openai import OpenAI

# Read the key from the OPENROUTER_API_KEY environment variable; never
# hard-code secrets in source.
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
    raise ValueError("OPENROUTER_API_KEY environment variable not set.")

# Point the OpenAI client to the OpenRouter API
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)
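
# Optional connectivity check (a sketch, not part of the original script): the
# OpenAI v1 client exposes `models.list()`, which OpenRouter also serves, so
# this can confirm the key and base_url are wired up before generating.
def check_connectivity() -> bool:
    """Return True if the OpenRouter API is reachable with the current key."""
    try:
        client.models.list()
        return True
    except Exception as exc:
        print(f"Connectivity check failed: {exc}")
        return False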

# --- Core Generation Functions ---

def generate_synthetic_text(
    prompt: str,
    model: str = "deepseek/deepseek-chat-v3-0324:free",
    system_message: str = "You are a helpful assistant generating synthetic data.",
    temperature: Optional[float] = 0.7,  # Default temperature
    top_p: Optional[float] = None,  # Default top_p (let the API decide if None)
    max_tokens: Optional[int] = None,  # Default max_tokens (let the API decide if None)
) -> str:
    """
    Generates synthetic text using an OpenRouter model via Chat Completions,
    including model parameter controls.

    Args:
        prompt: The user's input prompt.
        model: The model ID.
        system_message: The system message context.
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        The generated text string or an error message.
    """
    if not api_key:
        return "Error: OPENROUTER_API_KEY not configured properly. Please set the environment variable."

    # Prepare parameters, only including the optional ones when they are not None
    params = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        "extra_headers": {
            # "HTTP-Referer": "YOUR_SITE_URL",
            "X-Title": "SynthGen",
        },
    }
    if temperature is not None:
        params["temperature"] = temperature
    if top_p is not None:
        params["top_p"] = top_p
    if max_tokens is not None:
        params["max_tokens"] = max_tokens

    try:
        response = client.chat.completions.create(**params)  # Dictionary unpacking
        if response.choices and response.choices[0].message and response.choices[0].message.content:
            return response.choices[0].message.content.strip()
        else:
            print(f"Warning: No content in response for model {model}. Response: {response}")
            return "Error: No content generated by the model."
    except Exception as e:
        print(f"Error during API call to model {model}: {e}")
        return f"Error during API call: {e}"

def generate_prompts(
    num_prompts: int,
    model: str,
    topic_hint: str = "diverse and interesting",
    temperature: Optional[float] = 0.7,  # Passed through to the core call
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 200,  # A reasonable default cap for prompt lists
) -> list[str]:
    """
    Generates a list of conversation prompts using an AI model.

    Args:
        num_prompts: The number of prompts to generate.
        model: The model ID to use for generation.
        topic_hint: Optional hint for the kind of topics (e.g., "related to technology").
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A list of generated prompts.
    """
    instruction = (
        f"Generate exactly {num_prompts} unique, {topic_hint} system prompts or starting topics suitable "
        f"for generating synthetic conversations between a user and an AI assistant. "
        f"Each prompt should be concise (ideally one sentence) and focus on a clear task or subject. "
        f"Present each prompt on a new line, with no other introductory or concluding text."
        f"\n\nExamples:\n"
        f"- Act as a travel agent planning a trip to Japan.\n"
        f"- Explain the concept of black holes to a 5-year-old.\n"
        f"- Write a python function to reverse a string."
    )
    system_msg = "You are an expert prompt generator. Follow the user's instructions precisely."

    # Pass the settings down to generate_synthetic_text
    generated_text = generate_synthetic_text(
        instruction,
        model,
        system_message=system_msg,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if generated_text.startswith("Error"):
        raise ValueError(generated_text)

    # Split into lines, drop empties, and strip any leading "- " bullet markers
    prompts = [p.strip() for p in generated_text.strip().split("\n") if p.strip()]
    prompts = [re.sub(r"^-\s*", "", p) for p in prompts]
    if not prompts:
        # Log the raw generated text if parsing failed
        print(f"Warning: Failed to parse prompts from generated text. Raw text:\n{generated_text}")
        raise ValueError("AI failed to generate prompts in the expected format.")

    # Truncate in case the model generated more than the requested number
    return prompts[:num_prompts]

def generate_synthetic_conversation(
    system_prompt: str,
    model: str,
    num_turns: int,
    temperature: Optional[float] = 0.7,  # Passed through to the core call
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 1000,  # A reasonable default cap for conversations
) -> str:
    """
    Generates a synthetic conversation with a specified number of turns.

    Args:
        system_prompt: The initial system prompt defining the context or AI persona.
        model: The model ID to use for generation.
        num_turns: The desired number of conversational turns (1 turn = 1 User + 1 Assistant).
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A string containing the formatted conversation.
    """
    # Ask the model to generate the whole conversation in one go for simplicity;
    # more complex approaches could involve iterative calls.
    instruction = (
        f"Generate a realistic conversation between a 'User' and an 'Assistant'. "
        f"The conversation should start based on the following system prompt/topic: '{system_prompt}'.\n"
        f"The conversation should have approximately {num_turns} pairs of User/Assistant turns.\n"
        f"Format the output clearly, starting each line with 'User:' or 'Assistant:'.\n\n"
        f"Example Format:\n"
        f"User: Hello!\n"
        f"Assistant: Hi there! How can I help you today?\n"
        f"User: Can you explain photosynthesis?\n"
        f"Assistant: Certainly! Photosynthesis is the process..."
    )
    # Use the user-provided system prompt for the *conversation's* context,
    # but a generic one for the generation *task* itself.
    system_msg_for_generation = (
        f"You are an AI assistant simulating a conversation. "
        f"The context for the conversation you generate is: {system_prompt}"
    )

    # Pass the settings down to generate_synthetic_text
    conversation_text = generate_synthetic_text(
        prompt=instruction,
        model=model,
        system_message=system_msg_for_generation,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if conversation_text.startswith("Error"):
        # Propagate the error message
        return f"Error generating conversation for prompt '{system_prompt}':\n{conversation_text}"

    # Basic validation: warn if the expected 'User:'/'Assistant:' markers are missing
    if not re.search(r"User:|Assistant:", conversation_text, re.IGNORECASE):
        print(
            f"Warning: Generated text for conversation '{system_prompt}' might not be "
            f"in the expected format. Raw text:\n{conversation_text}"
        )
        # Return the raw text anyway; the model's format may just differ slightly
        return f"Generated conversation for prompt '{system_prompt}':\n(Format might vary)\n\n{conversation_text}"

    return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"

# Function to generate different types of content based on a topic
def generate_corpus_content(
    topic: str,
    content_type: str,  # e.g., "Corpus Snippets", "Short Story", "Article"
    length_param: int,  # Meaning depends on type (e.g., number of snippets, approx. words)
    model: str,
    system_message_base: str = "You are a helpful assistant generating synthetic content.",
    temperature: Optional[float] = 0.7,
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = None,  # Estimated per content type if None
) -> str:
    """
    Generates different types of synthetic content based on a topic.

    Args:
        topic: The central topic for the content.
        content_type: The type of content to generate.
        length_param: A parameter controlling length/quantity (meaning depends on type).
        model: The model ID.
        system_message_base: Base system message (will be specialized).
        temperature: Model temperature.
        top_p: Model top_p.
        max_tokens: Model max_tokens.

    Returns:
        The generated content string or an error message.
    """
    prompt = ""
    system_message = system_message_base  # Start with the base message

    # --- Construct the prompt based on content type ---
    if content_type == "Corpus Snippets":
        if length_param <= 0:
            length_param = 5  # Default number of snippets
        prompt = (
            f"Generate exactly {length_param} distinct text snippets related to the topic: '{topic}'. "
            f"Each snippet should be a few sentences long and focus on a different aspect if possible. "
            f"Present each snippet clearly, perhaps separated by a blank line or a marker like '---'."
        )
        system_message = "You are an AI generating diverse text snippets for a data corpus."
        # Estimate max_tokens from the number of snippets if not set
        if max_tokens is None:
            max_tokens = length_param * 150
    elif content_type == "Short Story":
        if length_param <= 0:
            length_param = 300  # Default approximate word count
        prompt = (
            f"Write a short story (approximately {length_param} words) centered around the topic: '{topic}'. "
            f"The story should have a clear beginning, middle, and end."
        )
        system_message = "You are a creative AI writing a short story."
        if max_tokens is None:
            max_tokens = int(length_param * 2.5)  # Rough words-to-tokens estimate
    elif content_type == "Article":
        if length_param <= 0:
            length_param = 500  # Default approximate word count
        prompt = (
            f"Write an informative article (approximately {length_param} words) about the topic: '{topic}'. "
            f"The article should be well-structured, factual (to the best of your ability), and engaging."
        )
        system_message = "You are an AI assistant writing an informative article."
        if max_tokens is None:
            max_tokens = int(length_param * 2.5)  # Rough words-to-tokens estimate
    else:
        return f"Error: Unknown content type '{content_type}'."

    if not prompt:
        return "Error: Could not construct a valid prompt."

    # --- Call the core generation function ---
    generated_text = generate_synthetic_text(
        prompt=prompt,
        model=model,
        system_message=system_message,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )

    # Add a title for clarity; propagate errors from generate_synthetic_text as-is
    if not generated_text.startswith("Error"):
        return f"Generated {content_type} for topic '{topic}':\n\n{generated_text}"
    else:
        return generated_text

# --- Main Execution (Example Usage) ---
if __name__ == "__main__":
    print("--- Testing Basic Text Generation ---")
    test_prompt = "Describe the benefits of using synthetic data."
    text_result = generate_synthetic_text(test_prompt, temperature=0.5, max_tokens=100)  # Example with settings
    print(f"Prompt: {test_prompt}\nResult:\n{text_result}\n")

    print("\n--- Testing Prompt Generation ---")
    try:
        num_prompts_to_gen = 3
        prompts_result = generate_prompts(num_prompts_to_gen, "deepseek/deepseek-chat-v3-0324:free")
        print(f"Generated {len(prompts_result)} prompts:")
        for i, p in enumerate(prompts_result):
            print(f"{i+1}. {p}")
    except ValueError as e:
        print(f"Error generating prompts: {e}")

    print("\n--- Testing Conversation Generation ---")
    conv_prompt = "Act as a helpful expert explaining the difference between nuclear fission and fusion."
    num_conv_turns = 3
    conv_result = generate_synthetic_conversation(conv_prompt, "deepseek/deepseek-chat-v3-0324:free", num_conv_turns)
    print(f"{conv_result}\n")
| print("\n--- Testing with Invalid API Key (if applicable) ---") | |
| # Temporarily use an invalid key for testing error handling | |
| original_key = client.api_key | |
| client.api_key = "invalid-key" | |
| error_text_result = generate_synthetic_text("Test prompt") | |
| print(f"Result with invalid key: {error_text_result}") | |
| client.api_key = original_key # Restore original key | |
| print("\nGeneration tests complete.") | |