| | import pandas as pd |
| | import os |
| | import fnmatch |
| | import json |
| | import re |
| | import numpy as np |
| | import requests |
| | from urllib.parse import quote |
| | from datetime import datetime |
| | import uuid |
| |
|
| |
|
| |
|
| | class DetailsDataProcessor: |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | def __init__(self, directory='results', pattern='results*.json'): |
| | self.directory = directory |
| | self.pattern = pattern |
| | if not os.path.exists('details_data'): |
| | os.makedirs('details_data') |
| |
|
| |
|
| | def _find_files(self, directory='results', pattern='results*.json'): |
| | matching_files = [] |
| | for root, dirs, files in os.walk(directory): |
| | for basename in files: |
| | if fnmatch.fnmatch(basename, pattern): |
| | filename = os.path.join(root, basename) |
| | matching_files.append(filename) |
| | return matching_files |
| |
|
| | @staticmethod |
| | def download_file(url, directory='details_data'): |
| | |
| | segments = url.split('/') |
| | organization = segments[-3] |
| | model_name = segments[-2] |
| | task = url.split('%7ChendrycksTest-')[1].split('%7C')[0] |
| |
|
| | |
| | safe_file_name = f"{organization}_{model_name}_{task}.json" |
| |
|
| | |
| | save_file_path = os.path.join(directory, safe_file_name) |
| |
|
| | error_count = 0 |
| | success_count = 0 |
| | try: |
| | |
| | r = requests.get(url, allow_redirects=True) |
| | r.raise_for_status() |
| |
|
| | |
| | with open(save_file_path, 'wb') as file: |
| | file.write(r.content) |
| | print(save_file_path) |
| |
|
| | success_count += 1 |
| | except requests.ConnectionError as e: |
| | error_count += 1 |
| | except requests.HTTPError as e: |
| | error_count += 1 |
| | except FileNotFoundError as e: |
| | error_count += 1 |
| | except Exception as e: |
| | error_count += 1 |
| |
|
| | return error_count, success_count |
| |
|
| |
|
| |
|
| | @staticmethod |
| | def single_file_pipeline(url, filename): |
| | DetailsDataProcessor.download_file(url, filename) |
| | |
| | with open(filename) as f: |
| | data = json.load(f) |
| | |
| | df = pd.DataFrame(data) |
| | return df |
| |
|
| | @staticmethod |
| | def build_url(file_path): |
| | segments = file_path.split('/') |
| | bits = segments[1] |
| | model_name = segments[2] |
| |
|
| | try: |
| | timestamp = segments[3].split('_')[1] |
| | except IndexError: |
| | print(f"Error: {file_path}") |
| | return None |
| | |
| | url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}' |
| | return url |
| | |
| | def pipeline(self): |
| | error_count = 0 |
| | success_count = 0 |
| | file_paths = self._find_files(self.directory, self.pattern) |
| | for file_path in file_paths: |
| | print(f"Processing file path: {file_path}") |
| | url = self.build_url(file_path) |
| | if url: |
| | errors, successes = self.download_file(url) |
| | error_count += errors |
| | success_count += successes |
| | else: |
| | print(f"Error building URL for file path: {file_path}") |
| | error_count += 1 |
| |
|
| | print(f"Downloaded {success_count} files successfully. Encountered {error_count} errors.") |
| | return success_count, error_count |
| |
|
| |
|