| | |
| | """ |
| | Generate the FunctionGemma evaluation benchmark. |
| | |
| | Creates 100 high-quality samples to assess function-calling accuracy across: |
| | - SEARCH_TOKEN calls |
| | - EXECUTE_SWAP calls |
| | - Incomplete requests (should ask back) |
| | - Irrelevant requests (should refuse) |
| | """ |
| |
|
| | import json |
| | import random |
| | import argparse |
| | from pathlib import Path |
| | from typing import Dict, List, Any, Optional |
| |
|
# Directory two levels above this file; default outputs are placed beneath it.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent
# Default destination of the generated benchmark JSON file.
DEFAULT_BENCHMARK_PATH = PROJECT_ROOT.joinpath("data", "benchmark_dataset.json")
| |
|
| | |
# (symbol, contract address) pairs, in the order the benchmark iterates them.
_TOKEN_ADDRESSES = (
    ("SOL", "So11111111111111111111111111111111111111112"),
    ("USDC", "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v"),
    ("JUP", "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN"),
    ("RAY", "4k3Dyjzvzp8eMZWUXbBCjEvwSkkk59S5iCNLY3QrkX6R"),
    ("BONK", "DezXAZ8z7PnrnRJjz3wXBoRgixCa6xjnB7YaB1pPB263"),
    ("WIF", "EKpQGSJtjMFqKZ9KQanSqYXRcF8fBopzLHYxdM65zcjm"),
    ("ETH", "7vfCXTUXx5WJV5JADk17DUJ4ksgau7utNKj4b963voxs"),
    ("BTC", "9n4nbM75f5Ui33ZbPYXn59EwSgE8CGsHtAeTH5YFeJ9E"),
    ("POPCAT", "7GCihgDB8fe6KNjn2MYtkzZcRjQy3t9GHdC8uHYmW2hr"),
    ("TRUMP", "6p6xgHyF7AeE6TZkSmFsko444wqoP15icUSqi2jfGiPN"),
)

# Symbol -> {"ca": contract address, "chain": chain name}; every entry here
# is registered with chain "solana".
TOKENS = {
    symbol: {"ca": address, "chain": "solana"}
    for symbol, address in _TOKEN_ADDRESSES
}
| |
|
# Chains accepted by the SEARCH_TOKEN tool.
CHAINS = ["solana", "ethereum", "bsc", "base"]


# Function-calling tool schemas attached to every benchmark sample.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "SEARCH_TOKEN",
            "description": "search token onchain",
            "parameters": {
                "type": "object",
                "properties": {
                    "symbol": {"type": ["string", "null"], "description": "Symbol of the token"},
                    "address": {"type": ["string", "null"], "description": "Contract address of the token"},
                    # Derived from CHAINS so the schema cannot drift from the
                    # list above; copied so the schema owns its own list.
                    "chain": {"type": "string", "enum": list(CHAINS), "description": "supported chains"},
                    "keyword": {"type": ["string", "null"], "description": "keyword to search for the token"}
                },
                "required": []
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "EXECUTE_SWAP",
            "description": "Swap tokens on the Solana blockchain. When the user specifies 'buy <token>', the default input token is SOL. When the user specifies 'sell <token>', the default output token is SOL.",
            "parameters": {
                "type": "object",
                "properties": {
                    "inputTokenSymbol": {"type": ["string", "null"], "description": "Symbol of the token to sell."},
                    "inputTokenCA": {"type": ["string", "null"], "description": "Contract address of the token to sell."},
                    "outputTokenCA": {"type": ["string", "null"], "description": "Contract address of the token to buy."},
                    "inputTokenAmount": {"type": ["string", "null"], "description": "Exact amount of the input token to swap."},
                    "inputTokenPercentage": {"type": ["number", "null"], "description": "Percentage of the input token balance to swap."},
                    "outputTokenAmount": {"type": ["string", "null"], "description": "Expected amount of the output token to receive."}
                },
                "required": ["inputTokenCA", "outputTokenCA", "inputTokenAmount", "inputTokenPercentage"]
            }
        }
    }
]
| |
|
| |
|
def create_benchmark_item(
    user_input: str,
    expected_function: Optional[str],
    expected_args: Optional[Dict] = None,
    category: str = "function_call",
    description: str = ""
) -> Dict:
    """Assemble a single benchmark record.

    Args:
        user_input: Text placed in the user message.
        expected_function: Name of the function the model should call, or
            None when no call is expected.
        expected_args: Arguments expected for that call, or None.
        category: Label stored under "category".
        description: Free-text note stored under "description".

    Returns:
        A dict with keys "id" (always None here; callers assign it later),
        "category", "description", "input" and "expected". The "tools"
        entry refers to the module-level TOOLS list itself, not a copy.
    """
    developer_turn = {
        "role": "developer",
        "content": "You are a model that can do function calling with the following functions",
    }
    user_turn = {"role": "user", "content": user_input}

    model_input = {"messages": [developer_turn, user_turn], "tools": TOOLS}
    expectation = {"function_name": expected_function, "arguments": expected_args}

    return {
        "id": None,
        "category": category,
        "description": description,
        "input": model_input,
        "expected": expectation,
    }
| |
|
| |
|
def generate_search_token_benchmarks() -> List[Dict]:
    """Build the SEARCH_TOKEN benchmark items.

    Items are produced in four groups, in this order: English symbol
    lookups, Chinese symbol lookups, lookups by contract address (the first
    five entries of TOKENS) and keyword searches.

    Returns:
        The benchmark records, each expecting a SEARCH_TOKEN call.
    """
    items: List[Dict] = []

    # English prompts: (prompt, symbol, chain, address, keyword).
    english_lookups = [
        ("Search for BONK token", "BONK", "solana", None, None),
        ("Find WIF on solana", "WIF", "solana", None, None),
        ("Look up JUP token", "JUP", "solana", None, None),
        ("Search ETH on ethereum", "ETH", "ethereum", None, None),
        ("Find USDC token on base", "USDC", "base", None, None),
    ]
    for prompt, sym, net, addr, kw in english_lookups:
        args = {"symbol": sym, "chain": net}
        # Optional fields are only included when a truthy value is given.
        if addr:
            args["address"] = addr
        if kw:
            args["keyword"] = kw
        items.append(create_benchmark_item(
            user_input=prompt,
            expected_function="SEARCH_TOKEN",
            expected_args=args,
            category="search_by_symbol",
            description=f"Search {sym} by symbol",
        ))

    # Chinese prompts: (prompt, symbol, chain).
    chinese_lookups = [
        ("帮我搜索 BONK 代币", "BONK", "solana"),
        ("查一下 WIF 这个币", "WIF", "solana"),
        ("找一下 JUP 代币信息", "JUP", "solana"),
        ("搜索 RAY 代币", "RAY", "solana"),
        ("查询 POPCAT 代币", "POPCAT", "solana"),
    ]
    items.extend(
        create_benchmark_item(
            prompt, "SEARCH_TOKEN", {"symbol": sym, "chain": net},
            "search_by_symbol_cn", f"Search {sym} by symbol (Chinese)"
        )
        for prompt, sym, net in chinese_lookups
    )

    # Address lookups for the first five registered tokens.
    for name in list(TOKENS)[:5]:
        entry = TOKENS[name]
        items.append(create_benchmark_item(
            f"Search token at address {entry['ca']}",
            "SEARCH_TOKEN",
            {"address": entry['ca'], "chain": entry['chain']},
            "search_by_address",
            f"Search {name} by address",
        ))

    # Keyword searches: (prompt, keyword, chain).
    keyword_lookups = [
        ("Search for dog themed tokens", "dog", "solana"),
        ("Find meme coins", "meme", "solana"),
        ("Look for cat tokens on base", "cat", "base"),
    ]
    items.extend(
        create_benchmark_item(
            prompt, "SEARCH_TOKEN", {"keyword": kw, "chain": net},
            "search_by_keyword", f"Search by keyword: {kw}"
        )
        for prompt, kw, net in keyword_lookups
    )

    return items
| |
|
| |
|
def _percent_label(fraction: float) -> int:
    """Convert a 0-1 fraction to a whole percent for use in descriptions.

    Uses round() rather than int(): int() truncates, so a float product such
    as 0.29 * 100 (28.999...) would be labelled 28 instead of 29.
    """
    return round(fraction * 100)


def _build_swap_items(cases: List[tuple], category: str, describe) -> List[Dict]:
    """Turn swap case rows into EXECUTE_SWAP benchmark items.

    Args:
        cases: Rows of (query, input_token, output_token, amount, percentage).
            Token symbols must be keys of TOKENS.
        category: Category label stored on every produced item.
        describe: Callable taking (input_token, output_token, amount,
            percentage) and returning the item's description string.

    Returns:
        One benchmark record per row, in row order.

    Raises:
        KeyError: If a row names a token symbol missing from TOKENS.
    """
    items = []
    for query, input_token, output_token, amount, percentage in cases:
        expected_args = {
            "inputTokenCA": TOKENS[input_token]["ca"],
            "outputTokenCA": TOKENS[output_token]["ca"],
            "inputTokenAmount": amount,
            "inputTokenPercentage": percentage,
        }
        items.append(create_benchmark_item(
            query, "EXECUTE_SWAP", expected_args,
            category, describe(input_token, output_token, amount, percentage)
        ))
    return items


def generate_execute_swap_benchmarks() -> List[Dict]:
    """Build the EXECUTE_SWAP benchmark items.

    Groups are emitted in this order: buy by amount, buy by percentage,
    sell by amount, sell by percentage, Chinese prompts and token-to-token
    swaps. In every row exactly one of amount / percentage is set; the
    other is None. Percentages are given as fractions (0.5 means 50%).

    Returns:
        The benchmark records, each expecting an EXECUTE_SWAP call.
    """
    benchmarks: List[Dict] = []

    # Buy with an explicit input amount.
    buy_cases = [
        ("Buy 1 SOL worth of BONK", "SOL", "BONK", "1", None),
        ("Purchase 5 SOL of WIF", "SOL", "WIF", "5", None),
        ("Buy 10 USDC worth of JUP", "USDC", "JUP", "10", None),
        ("I want to buy 2 SOL of RAY", "SOL", "RAY", "2", None),
        ("Get me 0.5 SOL of POPCAT", "SOL", "POPCAT", "0.5", None),
    ]
    benchmarks.extend(_build_swap_items(
        buy_cases, "buy_with_amount",
        lambda src, dst, amount, pct: f"Buy {dst} with {amount} {src}",
    ))

    # Buy with a share of the input token balance.
    buy_pct_cases = [
        ("Buy BONK with 50% of my SOL", "SOL", "BONK", None, 0.5),
        ("Use 30% of my USDC to buy WIF", "USDC", "WIF", None, 0.3),
        ("Spend 100% of my SOL on JUP", "SOL", "JUP", None, 1.0),
        ("Put 25% of my ETH into RAY", "ETH", "RAY", None, 0.25),
        ("Use half of my BTC to get BONK", "BTC", "BONK", None, 0.5),
    ]
    benchmarks.extend(_build_swap_items(
        buy_pct_cases, "buy_with_percentage",
        lambda src, dst, amount, pct: f"Buy {dst} with {_percent_label(pct)}% {src}",
    ))

    # Sell an explicit amount; the output token in every row is SOL.
    sell_cases = [
        ("Sell 1000 BONK", "BONK", "SOL", "1000", None),
        ("Sell 500 WIF for SOL", "WIF", "SOL", "500", None),
        ("Convert 100 JUP to SOL", "JUP", "SOL", "100", None),
        ("Dump 2000 RAY", "RAY", "SOL", "2000", None),
        ("Sell 50 USDC", "USDC", "SOL", "50", None),
    ]
    benchmarks.extend(_build_swap_items(
        sell_cases, "sell_with_amount",
        lambda src, dst, amount, pct: f"Sell {amount} {src}",
    ))

    # Sell a share of the holdings.
    sell_pct_cases = [
        ("Sell 50% of my BONK", "BONK", "SOL", None, 0.5),
        ("Dump all my WIF", "WIF", "SOL", None, 1.0),
        ("Sell 30% of my JUP holdings", "JUP", "SOL", None, 0.3),
        ("Get rid of 75% of my RAY", "RAY", "SOL", None, 0.75),
        ("Sell a quarter of my USDC", "USDC", "SOL", None, 0.25),
    ]
    benchmarks.extend(_build_swap_items(
        sell_pct_cases, "sell_with_percentage",
        lambda src, dst, amount, pct: f"Sell {_percent_label(pct)}% {src}",
    ))

    # Chinese-language buy and sell prompts.
    cn_swap_cases = [
        ("用 1 个 SOL 买 BONK", "SOL", "BONK", "1", None),
        ("把 50% 的 USDC 换成 WIF", "USDC", "WIF", None, 0.5),
        ("卖掉 1000 个 BONK", "BONK", "SOL", "1000", None),
        ("把所有 JUP 都卖了", "JUP", "SOL", None, 1.0),
        ("用 2 SOL 购买 RAY", "SOL", "RAY", "2", None),
        ("出售 30% 的 WIF", "WIF", "SOL", None, 0.3),
        ("买入 5 SOL 的 POPCAT", "SOL", "POPCAT", "5", None),
        ("清仓 ETH", "ETH", "SOL", None, 1.0),
    ]
    benchmarks.extend(_build_swap_items(
        cn_swap_cases, "swap_chinese",
        lambda src, dst, amount, pct: "Swap request in Chinese",
    ))

    # Direct token-to-token swaps.
    swap_cases = [
        ("Swap 100 USDC for BONK", "USDC", "BONK", "100", None),
        ("Exchange 50 JUP for WIF", "JUP", "WIF", "50", None),
        ("Convert all my ETH to USDC", "ETH", "USDC", None, 1.0),
    ]
    benchmarks.extend(_build_swap_items(
        swap_cases, "token_to_token",
        lambda src, dst, amount, pct: f"Swap {src} to {dst}",
    ))

    return benchmarks
| |
|
| |
|
def generate_incomplete_benchmarks() -> List[Dict]:
    """Build benchmark items for under-specified requests.

    Every item is created with no expected function and no expected
    arguments, i.e. the model is expected to ask for clarification instead
    of calling a tool.

    Returns:
        One benchmark record per prompt, in the order listed below.
    """
    # (prompt, category, description)
    underspecified_prompts = [
        ("I want to buy some tokens", "incomplete_no_token", "Missing token name"),
        ("Sell my holdings", "incomplete_no_token", "Missing which token to sell"),
        ("Search for a token", "incomplete_no_info", "Missing token info"),
        ("Buy something", "incomplete_vague", "Too vague"),
        ("我想买币", "incomplete_cn", "Missing token (Chinese)"),
        ("帮我卖掉", "incomplete_cn", "Missing token and amount (Chinese)"),
        ("Swap tokens", "incomplete_swap", "Missing swap details"),
        ("I want to trade", "incomplete_trade", "Missing trade details"),
    ]

    return [
        create_benchmark_item(prompt, None, None, label, note)
        for prompt, label, note in underspecified_prompts
    ]
| |
|
| |
|
def generate_irrelevant_benchmarks() -> List[Dict]:
    """Build benchmark items for off-topic requests.

    Every item is created with no expected function and no expected
    arguments, i.e. the model is expected not to call any tool.

    Returns:
        One benchmark record per prompt, in the order listed below.
    """
    # (prompt, category, description)
    off_topic_prompts = [
        ("What's the weather today?", "irrelevant_weather", "Weather query"),
        ("Tell me a joke", "irrelevant_joke", "Joke request"),
        ("What time is it?", "irrelevant_time", "Time query"),
        ("Who is the president?", "irrelevant_general", "General knowledge"),
        ("今天天气怎么样?", "irrelevant_cn", "Weather (Chinese)"),
        ("给我讲个笑话", "irrelevant_cn", "Joke (Chinese)"),
        ("Hello, how are you?", "irrelevant_greeting", "Greeting"),
        ("What is Bitcoin?", "irrelevant_info", "Info request (no action)"),
    ]

    return [
        create_benchmark_item(prompt, None, None, label, note)
        for prompt, label, note in off_topic_prompts
    ]
| |
|
| |
|
def generate_benchmark_dataset(output_path: str = str(DEFAULT_BENCHMARK_PATH)):
    """Generate the full benchmark dataset and write it to disk as JSON.

    Steps: collect the items from the four category generators, pad the
    list up to 100 items if it is shorter, cap it at 100, shuffle it with a
    fixed seed (42) so the order is reproducible, number the items 1..N,
    print category and function distributions, write the JSON file and
    print the first three items as examples.

    Args:
        output_path: Destination file. Missing parent directories are
            created.

    Returns:
        The list of benchmark records in their final (shuffled) order.
    """
    target_size = 100
    rule = "=" * 60

    print(rule)
    print("Generating FunctionGemma benchmark dataset")
    print(rule)

    all_benchmarks = []

    # Each generator's items are appended in this fixed order.
    sections = [
        ("SEARCH_TOKEN cases", generate_search_token_benchmarks),
        ("EXECUTE_SWAP cases", generate_execute_swap_benchmarks),
        ("Incomplete request cases", generate_incomplete_benchmarks),
        ("Irrelevant request cases", generate_irrelevant_benchmarks),
    ]
    for label, build in sections:
        section_items = build()
        print(f"{label}: {len(section_items)}")
        all_benchmarks.extend(section_items)

    # Padding rows: (query, first, second, amount, percentage, function).
    # For EXECUTE_SWAP first/second are the input/output token symbols; for
    # SEARCH_TOKEN they are the symbol and the chain.
    # NOTE(review): when the generators return fewer than target_size items,
    # these two prompts are repeated verbatim until the target is reached,
    # so the dataset then contains duplicate samples. Replacing them with
    # distinct cases would change the dataset contents - confirm with the
    # benchmark owner before doing so.
    extra_cases = [
        ("Buy 3 SOL of TRUMP", "SOL", "TRUMP", "3", None, "EXECUTE_SWAP"),
        ("Search for TRUMP token", "TRUMP", "solana", None, None, "SEARCH_TOKEN"),
    ]
    while len(all_benchmarks) < target_size:
        for query, first, second, amount, percentage, function_name in extra_cases:
            if len(all_benchmarks) >= target_size:
                break
            if function_name == "EXECUTE_SWAP":
                expected_args = {
                    "inputTokenCA": TOKENS[first]["ca"],
                    "outputTokenCA": TOKENS[second]["ca"],
                    "inputTokenAmount": amount,
                    "inputTokenPercentage": percentage,
                }
            else:
                expected_args = {"symbol": first, "chain": second}
            all_benchmarks.append(create_benchmark_item(
                query, function_name, expected_args, "extra", "Extra test case"
            ))

    all_benchmarks = all_benchmarks[:target_size]

    # A private generator seeded with 42 yields the same order as seeding
    # the module-level generator, without resetting the process-wide random
    # state for other code.
    random.Random(42).shuffle(all_benchmarks)

    # Ids follow the shuffled order.
    for position, item in enumerate(all_benchmarks, start=1):
        item["id"] = position

    print(f"\nTotal: {len(all_benchmarks)} cases")

    categories = {}
    for item in all_benchmarks:
        cat = item["category"]
        categories[cat] = categories.get(cat, 0) + 1

    print("\nCategory distribution:")
    for cat, count in sorted(categories.items()):
        print(f" - {cat}: {count}")

    # Pre-populated so all three rows are printed even when a count is zero.
    func_counts = {"SEARCH_TOKEN": 0, "EXECUTE_SWAP": 0, "None": 0}
    for item in all_benchmarks:
        func = item["expected"]["function_name"]
        if func:
            func_counts[func] = func_counts.get(func, 0) + 1
        else:
            func_counts["None"] += 1

    print("\nFunction distribution:")
    for func, count in func_counts.items():
        print(f" - {func}: {count}")

    # Create the destination directory so a direct call (not via main) does
    # not fail on a missing folder.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_benchmarks, f, ensure_ascii=False, indent=2)

    print(f"\nBenchmark saved to: {output_path}")

    print("\n" + rule)
    print("Examples:")
    print(rule)

    for i, item in enumerate(all_benchmarks[:3]):
        print(f"\n--- Example {i+1} ---")
        print(f"ID: {item['id']}")
        print(f"Category: {item['category']}")
        print(f"Input: {item['input']['messages'][1]['content']}")
        print(f"Expected function: {item['expected']['function_name']}")
        if item['expected']['arguments']:
            print(f"Expected args: {json.dumps(item['expected']['arguments'], ensure_ascii=False)}")

    return all_benchmarks
| |
|
| |
|
def main():
    """Command-line entry point.

    Parses the optional --output argument (defaulting to
    DEFAULT_BENCHMARK_PATH), creates the destination directory if needed
    and generates the benchmark dataset at that path.
    """
    cli = argparse.ArgumentParser(description="Generate FunctionGemma benchmark dataset")
    cli.add_argument(
        "--output",
        type=str,
        default=str(DEFAULT_BENCHMARK_PATH),
        help="Output file path",
    )
    options = cli.parse_args()

    destination = Path(options.output)
    # Make sure the target folder exists before the dataset is written.
    destination.parent.mkdir(parents=True, exist_ok=True)

    generate_benchmark_dataset(str(destination))


if __name__ == "__main__":
    main()
| |
|