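"""Collect candidate Dockerfile repositories from GitHub.

Generates a set of search queries, runs each one through the `gh` CLI,
optionally merges a manually curated list of popular repositories, and
writes both the raw and the deduplicated/filtered results to JSON files.
"""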
import argparse
import json
import subprocess
import time
from datetime import datetime
from pathlib import Path

LANGUAGES = [
    "python", "node", "go", "java", "rust", "php",
    "ruby", "typescript", "csharp", "scala", "kotlin", "perl", "elixir", "swift"
]
TOPICS = [
    "backend", "frontend", "production", "testing", "ci",
    "ml", "devops", "containers", "docker", "cloud", "microservices"
]
GENERAL = [
    "dockerfile", "docker container", "docker base image",
    "multi stage dockerfile", "dockerfile slim", "dockerfile devcontainer",
    "dockerfile ubuntu", "dockerfile alpine", "dockerfile debian"
]
DEFAULT_QUERIES = [
    "dockerfile python", "dockerfile node", "dockerfile typescript", "dockerfile javascript",
    "dockerfile golang", "dockerfile rust", "dockerfile java", "dockerfile kotlin", "dockerfile scala",
    "dockerfile php", "dockerfile ruby", "dockerfile csharp", "dockerfile dotnet", "dockerfile flask",
    "dockerfile django", "dockerfile fastapi", "dockerfile express", "dockerfile springboot",
    "dockerfile react", "dockerfile nextjs", "dockerfile vue", "dockerfile nuxt", "dockerfile svelte",
    "dockerfile laravel", "dockerfile symfony", "dockerfile postgres", "dockerfile mysql",
    "dockerfile mongodb", "dockerfile redis", "dockerfile nginx", "dockerfile apache",
    "dockerfile api", "dockerfile backend", "dockerfile frontend", "dockerfile microservices",
    "dockerfile monorepo", "dockerfile tensorflow", "dockerfile pytorch", "dockerfile huggingface",
    "dockerfile kubernetes", "dockerfile helm", "dockerfile gitlab", "dockerfile cicd",
    "dockerfile openshift", "dockerfile airflow", "dockerfile spark", "dockerfile jupyter",
    "dockerfile anaconda", "dockerfile dockerhub", "dockerfile datascience",
    "dockerfile databricks", "dockerfile github-actions", "dockerfile codequality"
]
SPECIAL_QUERIES = [
    "dockerfile base image", "dockerfile ci", "dockerfile cicd",
    "dockerfile templates", "dockerfile registry", "dockerfile minimal",
    "dockerfile multi-stage", "dockerfile builder", "dockerfile github workflow",
    "dockerfile production ready", "dockerfile examples", "dockerfile secure",
    "dockerfile dotnet", "dockerfile rust", "dockerfile slim image",
    "dockerfile cloud native", "dockerfile init", "dockerfile test image"
]

DEFAULT_OUTPUT_RAW = Path("data/metadata/repos_raw.json")
DEFAULT_OUTPUT_FILTERED = Path("data/metadata/repos_filtered.json")
DEFAULT_POPULAR_REPOS = Path("data/metadata/manual_popular_repos.json")


def generate_queries():
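    """Combine the static query lists with generated "dockerfile <lang> <topic>" pairs."""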
    queries = set()
    queries.update(GENERAL)
    queries.update(DEFAULT_QUERIES)
    queries.update(SPECIAL_QUERIES)

    for lang in LANGUAGES:
        for topic in TOPICS:
            queries.add(f"dockerfile {lang} {topic}")

    return sorted(queries)


def run_query(query, limit):
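    """Run one `gh search repos` query and return the parsed JSON result list."""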
| | print(f"🔍 Szukam: {query}") |
| | result = subprocess.run([ |
| | "gh", "search", "repos", query, |
| | "--limit", str(limit), |
| | "--json", "fullName,description,stargazersCount,updatedAt,createdAt,pushedAt,url" |
| | ], capture_output=True, text=True) |
| |
|
| | if result.returncode != 0: |
| | print(f"❌ Błąd zapytania: {result.stderr.strip()}") |
| | return [] |
| |
|
| | try: |
| | data = json.loads(result.stdout) |
| | if not data: |
| | print(f"⚠️ Brak wyników dla: {query}") |
| | return data |
| | except Exception as e: |
| | print(f"❌ Błąd JSON: {e}") |
| | return [] |
| |
|
| | def deduplicate_and_filter(repos, min_stars, min_date): |
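    """Drop duplicates and repos below the star count or last-update thresholds."""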
    seen = set()
    filtered = []
    for r in repos:
        name = r["fullName"]
        updated = datetime.strptime(r["updatedAt"][:10], "%Y-%m-%d")
        if name in seen:
            continue
        if r["stargazersCount"] < min_stars:
            continue
        if updated < min_date:
            continue
        seen.add(name)
        filtered.append(r)
    return filtered


def load_manual_popular_repos(path):
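    """Load a hand-curated repo list, filling in defaults for any missing metadata."""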
    if not path.exists():
        print(f"⚠️ File not found: {path}")
        return []

    with open(path, "r") as f:
        try:
            data = json.load(f)
            enriched = []
            for r in data:
                enriched.append({
                    "fullName": r["fullName"],
                    "url": r.get("url", ""),
                    "description": r.get("description", ""),
                    "stargazersCount": r.get("stargazersCount", 9999),
                    "updatedAt": r.get("updatedAt", "2024-01-01T00:00:00Z"),
                    "createdAt": r.get("createdAt", "2020-01-01T00:00:00Z"),
                    "pushedAt": r.get("pushedAt", "2024-01-01T00:00:00Z")
                })
            return enriched
        except Exception as e:
            print(f"❌ Failed to load popular repositories: {e}")
            return []


def main():
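    """Parse CLI arguments, run every query, and write raw plus filtered JSON."""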
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_output", type=Path, default=DEFAULT_OUTPUT_RAW)
    parser.add_argument("--filtered_output", type=Path, default=DEFAULT_OUTPUT_FILTERED)
    parser.add_argument("--queries", type=int, default=-1)
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--min_stars", type=int, default=5)
    parser.add_argument("--min_date", type=str, default="2021-01-01")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--include_popular", action="store_true")
    parser.add_argument("--popular_file", type=Path, default=DEFAULT_POPULAR_REPOS)
    args = parser.parse_args()

    args.raw_output.parent.mkdir(parents=True, exist_ok=True)
    args.filtered_output.parent.mkdir(parents=True, exist_ok=True)
    min_date = datetime.strptime(args.min_date, "%Y-%m-%d")

    if args.raw_output.exists() and not args.refresh:
        print(f"ℹ️ File {args.raw_output} already exists. Use --refresh to overwrite.")
        return

    all_queries = generate_queries()
    queries = all_queries if args.queries == -1 else all_queries[:args.queries]
    print(f"🧠 Generated {len(queries)} queries:")
    for q in queries:
        print(" •", q)

    all_results = []
    for idx, query in enumerate(queries, 1):
        print(f"\n🔄 [{idx}/{len(queries)}]")
        results = run_query(query, args.limit)
        all_results.extend(results)
        time.sleep(5)  # pause between searches to stay under GitHub's rate limits

    if args.include_popular:
        print(f"\n📌 Adding popular repositories from: {args.popular_file}")
        all_results.extend(load_manual_popular_repos(args.popular_file))

    print(f"\n📈 Total queries: {len(queries)}")
    print(f"📦 Raw results: {len(all_results)}")
    with open(args.raw_output, "w") as f:
        json.dump(all_results, f, indent=2)

    clean_repos = deduplicate_and_filter(all_results, args.min_stars, min_date)
    with open(args.filtered_output, "w") as f:
        json.dump(clean_repos, f, indent=2)

    print(f"✅ After filtering: {len(clean_repos)} repositories")
    print(f"📁 Saved to: {args.filtered_output}")


if __name__ == "__main__":
    main()