diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..c56eac397b010a42c03fad7def81e23c727b7fde --- /dev/null +++ b/LICENSE @@ -0,0 +1,197 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (which shall not include communications that are clearly marked or + otherwise designated in writing by the copyright owner as "Not a Contribution"). + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to use, reproduce, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Work, and to + permit persons to whom the Work is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Work. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, trademark, patent, and + other attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright notice to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. When redistributing + the Work or Derivative Works thereof, You may choose to offer, + and to charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same page as the copyright notice for easier identification within + third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..9e7c24d0c252c3efcbdec7b9645bea505b427e9a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,39 @@ +include README.md +include LICENSE +include pyproject.toml +include MANIFEST.in + +# Include images directory for README.md +recursive-include images * + +# Include package data +recursive-include algorithm *.json +recursive-include algorithm *.yaml +recursive-include algorithm *.yml +recursive-include algorithm *.txt +recursive-include algorithm *.md +recursive-include cli *.json +recursive-include cli *.yaml +recursive-include cli *.yml +recursive-include cli *.txt +recursive-include cli *.md + + +# Include templates and configuration files +recursive-include lf_algorithm/plugins/*/mcp_servers/*/templates.py +recursive-include lf_algorithm/plugins/*/mcp_servers/*/mcp_params.py + +# Exclude development files +global-exclude *.pyc +global-exclude *.pyo +global-exclude __pycache__ +global-exclude .DS_Store +global-exclude *.log +global-exclude .pytest_cache +global-exclude .mypy_cache +global-exclude .venv +global-exclude venv +global-exclude env +global-exclude .env +global-exclude .pypirc +global-exclude .ruff_cache diff --git a/README.md b/README.md index ca0a72acf32f8c6cc6e0d67e3236a79c26ba0acc..ee6f562f3f22ac15681e6b0328fc47188e0e2676 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,207 @@ --- -title: Lineagentic Flow -emoji: ⚡ -colorFrom: blue -colorTo: purple +title: lineagentic-flow +app_file: start_demo_server.py sdk: gradio -sdk_version: 5.42.0 -app_file: app.py -pinned: false +sdk_version: 5.39.0 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +
+ Lineagentic Logo +
+
+## Lineagentic-flow
+
+Lineagentic-flow is an agentic AI solution for building end-to-end data lineage across diverse types of data processing scripts on different platforms. It is designed to be modular and customizable, and can be extended to support new data processing script types. In a nutshell, this is what it does:
+
+```
+┌─────────────┐    ┌───────────────────────────────┐    ┌──────────────────┐
+│ source-code │───▶│  lineagentic-flow-algorithm   │───▶│  lineage output  │
+│             │    │                               │    │                  │
+└─────────────┘    └───────────────────────────────┘    └──────────────────┘
+```
+### Features
+
+- Plugin-based design pattern that is simple to extend and customize.
+- Command-line interface for quick analysis.
+- Support for multiple data processing script types (SQL, Python, Airflow, Spark, etc.)
+- Simple demo server that runs locally and in Hugging Face Spaces.
+
+## Quick Start
+
+### Installation
+
+Install the package from PyPI:
+
+```bash
+pip install lineagentic-flow
+```
+
+### Basic Usage
+
+```python
+import asyncio
+from lf_algorithm.framework_agent import FrameworkAgent
+import logging
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+
+async def main():
+    # Create an agent for SQL lineage extraction
+    agent = FrameworkAgent(
+        agent_name="sql-lineage-agent",
+        model_name="gpt-4o-mini",
+        source_code="SELECT id, name FROM users WHERE active = true"
+    )
+
+    # Run the agent to extract lineage
+    result = await agent.run_agent()
+    print(result)
+
+# Run the example
+asyncio.run(main())
+```
+### Supported Agents
+
+The following table shows the current development status of the agents in the Lineagentic-flow algorithm:
+
+
+| **Agent Name**        | **Done** | **Under Development** | **In Backlog** | **Comment**                          |
+|-----------------------|:--------:|:---------------------:|:--------------:|--------------------------------------|
+| python_lineage_agent  | ✓        |                       |                |                                      |
+| airflow_lineage_agent | ✓        |                       |                |                                      |
+| java_lineage_agent    | ✓        |                       |                |                                      |
+| spark_lineage_agent   | ✓        |                       |                |                                      |
+| sql_lineage_agent     | ✓        |                       |                |                                      |
+| flink_lineage_agent   |          |                       | ✓              |                                      |
+| beam_lineage_agent    |          |                       | ✓              |                                      |
+| shell_lineage_agent   |          |                       | ✓              |                                      |
+| scala_lineage_agent   |          |                       | ✓              |                                      |
+| dbt_lineage_agent     |          |                       | ✓              |                                      |
+
+
+### Environment Variables
+
+Set your API keys:
+
+```bash
+export OPENAI_API_KEY="your-openai-api-key"
+export HF_TOKEN="your-huggingface-token" # Optional
+```
+
+## What are the components of Lineagentic-flow?
+
+- Algorithm module: the brain of Lineagentic-flow. It contains the agents, implemented as plugins, which act as a chain-of-thought process to extract lineage from different types of data processing scripts. The module is built using a plugin-based design pattern, allowing you to easily develop and integrate your own custom agents.
+
+- CLI module: a command-line interface around the algorithm API that connects to the unified service layer.
+
+- Demo module: for teams who want to demo Lineagentic-flow in a fast and simple way; it is deployable to Hugging Face Spaces.
+
+### Command Line Interface (CLI)
+
+Lineagentic-flow provides a powerful CLI tool for quick analysis:
+
+```bash
+# Basic SQL query analysis
+lineagentic analyze --agent-name sql-lineage-agent --query "SELECT user_id, name FROM users WHERE active = true" --verbose
+
+# Analyze a Python script from a file
+lineagentic analyze --agent-name python-lineage-agent --query-file "my_script.py" --verbose
+
+```
+For more details, see the [CLI documentation](cli/README.md).
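+
+The `--query-file` flow above maps directly onto the package API. The sketch below is a minimal programmatic equivalent: it reads a script from disk, runs the chosen agent, and prints the lineage as JSON. The file name and agent choice are placeholders, and it assumes the same environment variables as the CLI (for example `OPENAI_API_KEY`) are already set; `run_agent()` returns an `AgentResult` (which has a `to_dict()` method) on success, or a plain dict containing an `"error"` key on failure.
+
+```python
+import asyncio
+import json
+
+from lf_algorithm.framework_agent import FrameworkAgent
+
+
+async def analyze_file(path: str, agent_name: str = "python-lineage-agent") -> dict:
+    # Read the script to analyze, just like `lineagentic analyze --query-file`
+    with open(path, "r", encoding="utf-8") as f:
+        source_code = f.read()
+
+    agent = FrameworkAgent(
+        agent_name=agent_name,
+        model_name="gpt-4o-mini",
+        source_code=source_code,
+    )
+    result = await agent.run_agent()
+
+    # AgentResult on success, or a dict with an "error" key on failure
+    return result.to_dict() if hasattr(result, "to_dict") else result
+
+
+# Assumes a my_script.py exists next to this file
+print(json.dumps(asyncio.run(analyze_file("my_script.py")), indent=2))
+```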
+
+### Environment variables
+
+- HF_TOKEN (HUGGINGFACE_TOKEN)
+- OPENAI_API_KEY
+
+### Architecture
+
+The following figure illustrates the architecture behind Lineagentic-flow: essentially a multi-layer architecture of a backend and an agentic AI algorithm that leverages a chain-of-thought process to construct lineage across various script types.
+
+![Architecture Diagram](https://raw.githubusercontent.com/lineagentic/lineagentic-flow/main/images/architecture.png)
+
+
+## Mathematics behind the algorithm
+
+The following shows the mathematics behind each layer of the algorithm.
+
+### Agent Framework
+The agent framework does I/O operations, memory management, and prompt engineering according to the script type (T) and its content (C).
+
+$$
+P := f(T, C)
+$$
+
+### Runtime Orchestration Agent
+
+The runtime orchestration agent orchestrates the execution of the required agents provided by the agent framework (P) by selecting the appropriate agent (A) and its corresponding task (T).
+
+$$
+G := h([\{(A_1, T_1), (A_2, T_2), (A_3, T_3), (A_4, T_4)\}], P)
+$$
+
+### Syntax Analysis Agent
+
+The Syntax Analysis agent analyzes the syntactic structure of the raw script to identify subqueries and nested structures, and decomposes the script into multiple subscripts.
+
+$$
+\{sa_1, \dots, sa_n\} := h([A_1, T_1], P)
+$$
+
+### Field Derivation Agent
+The Field Derivation agent processes each subscript from the Syntax Analysis agent to derive field-level mapping relationships and processing logic.
+
+$$
+\{fd_1, \dots, fd_n\} := h([A_2, T_2], \{sa_1, \dots, sa_n\})
+$$
+
+### Operation Tracing Agent
+The Operation Tracing agent analyzes the complex conditions within each subscript identified by the Syntax Analysis agent, including filter, join, grouping, and sorting conditions.
+
+$$
+\{ot_1, \dots, ot_n\} := h([A_3, T_3], \{sa_1, \dots, sa_n\})
+$$
+
+### Event Composer Agent
+The Event Composer agent consolidates the results from the Syntax Analysis agent, the Field Derivation agent, and the Operation Tracing agent to generate the final lineage result.
+
+$$
+\{A\} := h([A_4, T_4], \{sa_1, \dots, sa_n\}, \{fd_1, \dots, fd_n\}, \{ot_1, \dots, ot_n\})
+$$
+
+
+
+## Activation and Deployment
+
+To simplify the usage of Lineagentic-flow, a Makefile manages the various activation and deployment tasks. You can explore the available targets directly within the Makefile; the most common ones are listed below.
+
+1- To start the demo server:
+
+```bash
+make start-demo-server
+```
+2- To run all tests:
+
+```bash
+make test
+```
+3- To build the package:
+
+```bash
+make build-package
+```
+4- To clean the whole stack:
+
+```bash
+make clean-all-stack
+```
+
+5- To deploy Lineagentic-flow to Hugging Face Spaces, run the following command (you need a Hugging Face account and must set your secret keys there if you are going to use paid models):
+
+```bash
+make gradio-deploy
+```
\ No newline at end of file
diff --git a/cli/README.md b/cli/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c9d8e5061b525e2165da8f1a32dcf73cd0a12017
--- /dev/null
+++ b/cli/README.md
@@ -0,0 +1,167 @@
+# Lineagentic-flow CLI
+
+A command-line interface for the Lineagentic-flow framework that provides agentic data lineage parsing across various data processing script types.
+
+## Installation
+
+The CLI is automatically installed when you install the lineagentic-flow package:
+
+```bash
+pip install -e .
+```
+
+## Usage
+
+The CLI provides two main commands: `analyze` and `field-lineage`.
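+
+The commands are built on argparse, so the built-in help is the quickest way to confirm the exact flags available in your installed version:
+
+```bash
+# Top-level help and the list of subcommands
+lineagentic --help
+
+# Flags accepted by the analyze command
+lineagentic analyze --help
+```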
+ +### Basic Commands + +#### Analyze Query/Code for Lineage +```bash +lineagentic analyze --agent-name sql-lineage-agent --query "your code here" +``` + + +### Running Analysis + +#### Using a Specific Agent +```bash +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a,b FROM table1" +``` + +#### Using a File as Input +```bash +lineagentic analyze --agent-name python-lineage-agent --query-file path/to/your/script.py +``` + +#### Specifying a Different Model +```bash +lineagentic analyze --agent-name airflow-lineage-agent --model-name gpt-4o --query "your code here" +``` + +#### With Lineage Configuration +```bash +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --job-namespace "my-namespace" --job-name "my-job" +``` + +### Output Options + +#### Pretty Print Results +```bash +lineagentic analyze --agent-name sql --query "your code" --pretty +``` + +#### Save Results to File +```bash +lineagentic analyze --agent-name sql --query "your code" --output results.json +``` + +#### Save Results with Pretty Formatting +```bash +lineagentic analyze --agent-name python --query "your code" --output results.json --pretty +``` + +#### Enable Verbose Output +```bash +lineagentic analyze --agent-name sql --query "your code" --verbose +``` + +## Available Agents + +- **sql-lineage-agent**: Analyzes SQL queries and scripts (default) +- **airflow-lineage-agent**: Analyzes Apache Airflow DAGs and workflows +- **spark-lineage-agent**: Analyzes Apache Spark jobs +- **python-lineage-agent**: Analyzes Python data processing scripts +- **java-lineage-agent**: Analyzes Java data processing code + +## Commands + +### `analyze` Command + +Analyzes a query or code for lineage information. + +#### Required Arguments +- Either `--query` or `--query-file` must be specified + +### Basic Query Analysis +```bash +# Simple SQL query analysis +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT user_id, name FROM users WHERE active = true" + +# Analyze with specific agent +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a, b FROM table1 JOIN table2 ON table1.id = table2.id" + +# Analyze Python code +lineagentic analyze --agent-name python-lineage-agent --query "import pandas as pd; df = pd.read_csv('data.csv'); result = df.groupby('category').sum()" + +# Analyze Java code +lineagentic analyze --agent-name java-lineage-agent --query "public class DataProcessor { public void processData() { // processing logic } }" + +# Analyze Spark code +lineagentic analyze --agent-name spark-lineage-agent --query "val df = spark.read.csv('data.csv'); val result = df.groupBy('category').agg(sum('value'))" + +# Analyze Airflow DAG +lineagentic analyze --agent-name airflow-lineage-agent --query "from airflow import DAG; from airflow.operators.python import PythonOperator; dag = DAG('my_dag')" +``` + + +### Reading from File +```bash +# Analyze query from file +lineagentic analyze --agent-name sql-lineage-agent --query-file "queries/user_analysis.sql" + +# Analyze Python script from file +lineagentic analyze --agent-name python-lineage-agent --query-file "scripts/data_processing.py" +``` + +### Output Options +```bash +# Save results to file +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --output "results.json" + +# Pretty print results +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --pretty + +# Verbose output +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" 
--verbose + +# Don't save to database +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --no-save + +# Don't save to Neo4j +lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --no-neo4j +``` + + + +## Common Output Options + +Both commands support these output options: + +- `--output`: Output file path for results (JSON format) +- `--pretty`: Pretty print the output +- `--verbose`: Enable verbose output + +## Error Handling + +The CLI provides clear error messages for common issues: + +- Missing required arguments +- File not found errors +- Agent execution errors +- Invalid agent names + +## Development + +To run the CLI in development mode: + +```bash +python -m cli.main --help +``` + +To run a specific command: + +```bash +python -m cli.main analyze --agent-name sql --query "SELECT 1" --pretty +``` + diff --git a/cli/__init__.py b/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..361b1d180b53f5a548163faa2419bd931cfbabaa --- /dev/null +++ b/cli/__init__.py @@ -0,0 +1,5 @@ +""" +CLI package for lineagentic framework. +""" + +__version__ = "0.1.0" \ No newline at end of file diff --git a/cli/main.py b/cli/main.py new file mode 100644 index 0000000000000000000000000000000000000000..b56a8602cb1b005e1b0d7ce17f2267a38d538105 --- /dev/null +++ b/cli/main.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +""" +Main CLI entry point for lineagentic framework. +""" + +import asyncio +import argparse +import sys +import os +import logging +from pathlib import Path + +# Add the project root to the Python path +project_root = Path(__file__).parent.parent +sys.path.insert(0, str(project_root)) + +from lf_algorithm.framework_agent import FrameworkAgent + + +def configure_logging(verbose: bool = False, quiet: bool = False): + """Configure logging for the CLI application.""" + if quiet: + # Quiet mode: only show errors + logging.basicConfig( + level=logging.ERROR, + format='%(levelname)s: %(message)s' + ) + elif verbose: + # Verbose mode: show all logs with detailed format + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + else: + # Normal mode: show only important logs with clean format + logging.basicConfig( + level=logging.WARNING, # Only show warnings and errors by default + format='%(levelname)s: %(message)s' + ) + + # Set specific loggers to INFO level for better user experience + logging.getLogger('lf_algorithm').setLevel(logging.INFO) + logging.getLogger('lf_algorithm.framework_agent').setLevel(logging.INFO) + logging.getLogger('lf_algorithm.agent_manager').setLevel(logging.INFO) + + # Suppress noisy server logs from MCP tools + logging.getLogger('mcp').setLevel(logging.WARNING) + logging.getLogger('agents.mcp').setLevel(logging.WARNING) + logging.getLogger('agents.mcp.server').setLevel(logging.WARNING) + logging.getLogger('agents.mcp.server.stdio').setLevel(logging.WARNING) + logging.getLogger('agents.mcp.server.stdio.stdio').setLevel(logging.WARNING) + + # Suppress MCP library logs specifically + logging.getLogger('mcp.server').setLevel(logging.WARNING) + logging.getLogger('mcp.server.fastmcp').setLevel(logging.WARNING) + logging.getLogger('mcp.server.stdio').setLevel(logging.WARNING) + + # Suppress any logger that contains 'server' in the name + for logger_name in logging.root.manager.loggerDict: + if 'server' in logger_name.lower(): + logging.getLogger(logger_name).setLevel(logging.WARNING) + + # Additional 
MCP-specific suppressions + logging.getLogger('mcp.server.stdio.stdio').setLevel(logging.WARNING) + logging.getLogger('mcp.server.stdio.stdio.stdio').setLevel(logging.WARNING) + +def create_parser(): + """Create and configure the argument parser.""" + parser = argparse.ArgumentParser( + description="Lineagentic - Agentic approach for code analysis and lineage extraction", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + + lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a,b FROM table1" + lineagentic analyze --agent-name python-lineage-agent --query-file "my_script.py" + """ + ) + + # Create subparsers for the two main operations + subparsers = parser.add_subparsers(dest='command', help='Available commands') + + # Analyze query subparser + analyze_parser = subparsers.add_parser('analyze', help='Analyze code or query for lineage information') + analyze_parser.add_argument( + "--agent-name", + type=str, + default="sql", + help="Name of the agent to use (e.g., sql, airflow, spark, python, java) (default: sql)" + ) + analyze_parser.add_argument( + "--model-name", + type=str, + default="gpt-4o-mini", + help="Model to use for the agents (default: gpt-4o-mini)" + ) + analyze_parser.add_argument( + "--query", + type=str, + help="Code or query to analyze" + ) + analyze_parser.add_argument( + "--query-file", + type=str, + help="Path to file containing the query/code to analyze" + ) + + # Common output options + analyze_parser.add_argument( + "--output", + type=str, + help="Output file path for results (JSON format)" + ) + analyze_parser.add_argument( + "--pretty", + action="store_true", + help="Pretty print the output" + ) + analyze_parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose output with detailed logging" + ) + analyze_parser.add_argument( + "--quiet", + action="store_true", + help="Suppress all output except errors" + ) + + return parser + + +def read_query_file(file_path: str) -> str: + """Read query from a file.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + except FileNotFoundError: + print(f"Error: File '{file_path}' not found.") + sys.exit(1) + except Exception as e: + print(f"Error reading file '{file_path}': {e}") + sys.exit(1) + + + + + +def save_output(result, output_file: str = None, pretty: bool = False): + """Save or print the result.""" + # Convert AgentResult to dict if needed + if hasattr(result, 'to_dict'): + result_dict = result.to_dict() + else: + result_dict = result + + if output_file: + import json + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(result_dict, f, indent=2 if pretty else None) + print(f"Results saved to '{output_file}'") + else: + if pretty: + import json + print("\n" + "="*50) + print("ANALYSIS RESULTS") + print("="*50) + print(json.dumps(result_dict, indent=2)) + print("="*50) + else: + print("\nResults:", result_dict) + + +async def run_analyze_query(args): + """Run analyze_query operation.""" + logger = logging.getLogger(__name__) + + # Get the query + query = args.query + if args.query_file: + query = read_query_file(args.query_file) + + if not query: + logger.error("Either --query or --query-file must be specified.") + sys.exit(1) + + logger.info(f"Running agent '{args.agent_name}' with query...") + + try: + # Create FrameworkAgent instance + agent = FrameworkAgent( + agent_name=args.agent_name, + model_name=args.model_name, + source_code=query + ) + + # Run the agent + result = await agent.run_agent() + + 
save_output(result, args.output, args.pretty) + + except Exception as e: + logger.error(f"Error running agent '{args.agent_name}': {e}") + sys.exit(1) + + + + + +async def main_async(): + """Main CLI function.""" + parser = create_parser() + args = parser.parse_args() + + # Check if a command was provided + if not args.command: + parser.print_help() + sys.exit(1) + + # Configure logging based on verbosity + configure_logging(verbose=args.verbose, quiet=args.quiet) + + # Run the appropriate command + if args.command == 'analyze': + await run_analyze_query(args) + else: + print(f"Unknown command: {args.command}") + sys.exit(1) + + +def main(): + """Synchronous wrapper for the async main function.""" + asyncio.run(main_async()) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/demo_server.py b/demo_server.py new file mode 100644 index 0000000000000000000000000000000000000000..0512fde975b6b5b575d58fcbbb04c9283c5ac2b9 --- /dev/null +++ b/demo_server.py @@ -0,0 +1,321 @@ +import gradio as gr +import asyncio +import json +import threading +import time +import sys +import os +import logging +from typing import Optional, Dict, Any +from datetime import datetime + +# Import from the published package +from lf_algorithm import FrameworkAgent +from lf_algorithm.utils import write_lineage_log + +# Configure logging for the demo server +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' +) + +class SQLLineageFrontend: + def __init__(self): + self.agent_framework = None + self.current_results = None + self.current_agent_name = None + self.log_thread = None + self.should_stop_logging = False + self.logger = logging.getLogger(__name__) + + def get_visualize_link(self) -> str: + """Generate JSONCrack visualization interface for aggregation data""" + if self.current_results is None: + return """ +
+
📊 Visualization Ready
+
+ After you run an analysis and it succeeds, go to the following link:
+
+ + 🔗 Open editor for simple check and paste the results there + +
+ """ + + try: + # Get the aggregation data - now it's directly the current_results + aggregation_data = self.current_results + + # Handle different result types + if isinstance(aggregation_data, str): + try: + # Try to parse as JSON first + parsed_data = json.loads(aggregation_data) + data_to_encode = parsed_data + except json.JSONDecodeError: + # If it's not valid JSON, wrap it in a dict + data_to_encode = {"aggregation_output": aggregation_data} + elif hasattr(aggregation_data, 'to_dict'): + # Handle AgentResult objects + data_to_encode = aggregation_data.to_dict() + elif isinstance(aggregation_data, dict): + data_to_encode = aggregation_data + else: + # Fallback for other object types + data_to_encode = {"aggregation_output": str(aggregation_data)} + + # Format JSON for display + formatted_json = json.dumps(data_to_encode, indent=2) + + return f""" +
+
+ ✅ Analysis Complete! Ready for Visualization +
+
+ 📋 Steps to visualize your results:
+ 1. Click "Open JSONCrack Editor" below
+ 2. Click "Copy JSON" button or click the JSON data below to select all
+ 3. Paste it into the JSONCrack editor +
+ + 🔗 Open JSONCrack Editor + +

+
+
+
📄 Analysis Results (JSON)
+ +
+ +
+
+ """ + except Exception as e: + return f"
❌ Error generating visualization data: {str(e)}
" + + def get_logs_html(self) -> str: + """Generate HTML for live logs display""" + if self.current_agent_name is None: + return "
No agent initialized yet
" + + return f"""
+
+ 📝 Logging Status for Agent: {self.current_agent_name} +
+
+ ✅ Standard Python Logging Active
+ • All logs are being captured by the application's logging system
+ • Check your console/terminal for real-time log output
+ • Logs include detailed information about agent execution
+ • Structured logging with timestamps and log levels

+ + 📋 Log Types Available:
+ • INFO - General information and progress
+ • DEBUG - Detailed debugging information
+ • WARNING - Warning messages
+ • ERROR - Error messages

+ + 🔍 What You'll See:
+ • Agent initialization and configuration
+ • MCP tool interactions and responses
+ • Analysis progress and completion status
+ • Any errors or warnings during execution +
+
""" + + def test_log_writing(self): + """Test function to write a sample log entry""" + if self.current_agent_name: + try: + write_lineage_log(self.current_agent_name, "test", "Test log entry from frontend") + self.logger.info(f"Test log written successfully for agent: {self.current_agent_name}") + return f"✅ Test log written successfully for agent: {self.current_agent_name}! Check your console output." + except Exception as e: + self.logger.error(f"Failed to write test log: {e}") + return f"❌ Failed to write test log: {e}" + else: + return "⚠️ Please initialize an agent first by running an analysis" + + def get_results_info(self) -> str: + """Get information about the current results""" + if self.current_results is None: + return "No results available yet" + + if isinstance(self.current_results, dict) and "error" in self.current_results: + return f"Error in results: {self.current_results['error']}" + + if hasattr(self.current_results, 'to_dict'): + # AgentResult object + result_dict = self.current_results.to_dict() + inputs_count = len(result_dict.get('inputs', [])) + outputs_count = len(result_dict.get('outputs', [])) + return f"✅ Structured results with {inputs_count} input(s) and {outputs_count} output(s)" + + if isinstance(self.current_results, dict): + return f"✅ Dictionary results with {len(self.current_results)} keys" + + return f"✅ Results type: {type(self.current_results).__name__}" + + async def run_analysis(self, agent_name: str, model_name: str, query: str): + """Run SQL lineage analysis""" + try: + # Validate input + if not query or not query.strip(): + return "❌ Error: Query cannot be empty. Please provide a valid query for analysis." + + self.logger.info(f"Starting analysis with agent: {agent_name}, model: {model_name}") + + # Initialize the agent framework with simplified constructor + self.agent_framework = FrameworkAgent( + agent_name=agent_name, + model_name=model_name, + source_code=query.strip() + ) + self.current_agent_name = agent_name + + self.logger.info(f"Agent framework initialized. Running analysis...") + + # Run the analysis using the structured results method + results = await self.agent_framework.run_agent() + self.current_results = results + + # Check if we got an error response + if isinstance(results, dict) and "error" in results: + self.logger.error(f"Analysis failed: {results['error']}") + return f"❌ Analysis failed: {results['error']}" + + self.logger.info(f"Analysis completed successfully for agent: {agent_name}") + + return f"""✅ Analysis completed successfully! Results are now available in the visualization section. + Click 'Open JSONCrack Editor' to visualize your data lineage. + + If you want to set up your own local development environment or deploy this in production, + please refer to the GitHub repository mentioned above.""" + + except ValueError as ve: + self.logger.error(f"Validation error: {ve}") + return f"❌ Validation error: {str(ve)}" + except Exception as e: + self.logger.error(f"Error running analysis: {e}") + return f"❌ Error running analysis: {str(e)}" + + def run_analysis_sync(self, agent_name: str, model_name: str, query: str): + """Synchronous wrapper for run_analysis""" + return asyncio.run(self.run_analysis(agent_name, model_name, query)) + + def create_ui(self): + """Create the Gradio interface""" + with gr.Blocks(title="SQL Lineage Analysis", fill_width=True) as ui: + + gr.Markdown('
🔍 Demo Lineagentic-Flow
') + gr.Markdown('
Analyze data lineage with AI-powered agents
') + gr.Markdown('
Check out the agent types for the supported script types
') + gr.Markdown('
For local and production runs, check out the repo: 🔗 https://github.com/lineagentic/lineagentic-flow
') + + with gr.Row(): + # Left column - Configuration and Query + with gr.Column(scale=1): + gr.Markdown("### 1. Agent Configuration") + agent_dropdown = gr.Dropdown( + label="Agent Type", + choices=[ + "sql-lineage-agent", + "python-lineage-agent", + "airflow-lineage-agent", + "java-lineage-agent", + "spark-lineage-agent" + ], + value="sql-lineage-agent" + ) + model_dropdown = gr.Dropdown( + label="Model", + choices=[ + "gpt-4o-mini", + "gpt-4o", + "deepseek-coder", + "deepseek-chat", + "gemini-pro" + ], + value="gpt-4o-mini" + ) + + gr.Markdown("### 2. Query for Lineage Analysis") + query_input = gr.Textbox( + label="Query", + placeholder="Enter your SQL query here...", + lines=9, + max_lines=15 + ) + + analyze_button = gr.Button("🚀 Run Analysis", variant="primary", size="lg") + status_output = gr.Textbox(label="Status", interactive=False) + + # Right column - Visualization and Logs + with gr.Column(scale=1): + gr.Markdown("### 3. Results Information") + results_info = gr.Textbox( + label="Results Status", + value=self.get_results_info(), + interactive=False + ) + + gr.Markdown("### 4. Visualize Results") + gr.Markdown("📊 After successful analysis, visualize your results in demo editor") + visualize_html = gr.HTML(self.get_visualize_link()) + + gr.Markdown("### 5. Logging Information") + logs_html = gr.HTML(self.get_logs_html()) + test_log_button = gr.Button("Test Log Writing", variant="secondary", size="sm") + + # Auto-refresh logs every 5 seconds + refresh_logs = gr.Button("🔄 Refresh Logs", variant="secondary", size="sm") + refresh_results = gr.Button("🔄 Refresh Results Info", variant="secondary", size="sm") + + # Event handlers + def run_analysis_and_update(agent_name, model_name, query): + """Run analysis and update visualization""" + # Run the analysis + status_result = self.run_analysis_sync(agent_name, model_name, query) + # Update visualization, logs, and results info + viz_html = self.get_visualize_link() + logs_html = self.get_logs_html() + results_info = self.get_results_info() + return status_result, results_info, viz_html, logs_html + + analyze_button.click( + fn=run_analysis_and_update, + inputs=[agent_dropdown, model_dropdown, query_input], + outputs=[status_output, results_info, visualize_html, logs_html] + ) + + test_log_button.click( + fn=self.test_log_writing, + inputs=[], + outputs=[status_output] + ) + + refresh_logs.click( + fn=self.get_logs_html, + inputs=[], + outputs=[logs_html] + ) + + refresh_results.click( + fn=self.get_results_info, + inputs=[], + outputs=[results_info] + ) + + return ui + + def run(self): + """Launch the Gradio interface""" + ui = self.create_ui() + ui.launch(share=False, inbrowser=True) + +if __name__ == "__main__": + frontend = SQLLineageFrontend() + frontend.run() \ No newline at end of file diff --git a/deploy_setup.py b/deploy_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..57029d320455de1b05d214a63c117ee5e8fa5dc6 --- /dev/null +++ b/deploy_setup.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Deployment setup script for Hugging Face Spaces +This script installs the local package after all files are copied +""" + +import subprocess +import sys +import os + +def install_local_package(): + """Install the local package in editable mode""" + try: + print("📦 Installing local lineagentic-flow package...") + + # First, try to install in editable mode + result = subprocess.run([ + sys.executable, "-m", "pip", "install", "-e", "." 
+ ], capture_output=True, text=True, cwd=os.getcwd()) + + if result.returncode == 0: + print("✅ Local package installed successfully!") + + # Verify that entry points are registered + try: + import importlib.metadata + entry_points = list(importlib.metadata.entry_points(group='lineagentic.lf_algorithm.plugins')) + print(f"✅ Found {len(entry_points)} registered plugins:") + for ep in entry_points: + print(f" - {ep.name}") + return True + except Exception as e: + print(f"⚠️ Warning: Could not verify entry points: {e}") + return True + else: + print(f"❌ Failed to install local package: {result.stderr}") + return False + except Exception as e: + print(f"❌ Error installing local package: {e}") + return False + +if __name__ == "__main__": + install_local_package() diff --git a/lf_algorithm/__init__.py b/lf_algorithm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..99bb41e3f3cc81bc85ee560f3c1ce580608b8c56 --- /dev/null +++ b/lf_algorithm/__init__.py @@ -0,0 +1,46 @@ +# lf_algorithm/__init__.py +import logging + +# Add NullHandler to prevent "No handler could be found" warnings +# This is the only logging configuration the library should do +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +from .framework_agent import FrameworkAgent +from .utils import write_lineage_log +from .utils.file_utils import dump_json_record, read_json_records, clear_json_file, get_file_stats +from .utils.tracers import LogTracer, log_trace_id +from .models.models import AgentResult +from .plugins.sql_lineage_agent.lineage_agent import SqlLineageAgent, create_sql_lineage_agent, get_plugin_info as get_sql_plugin_info +from .plugins.python_lineage_agent.lineage_agent import PythonLineageAgent, create_python_lineage_agent, get_plugin_info as get_python_plugin_info +from .plugins.airflow_lineage_agent.lineage_agent import AirflowLineageAgent, create_airflow_lineage_agent, get_plugin_info as get_airflow_plugin_info +from .plugins.java_lineage_agent.lineage_agent import JavaLineageAgent, create_java_lineage_agent, get_plugin_info as get_java_plugin_info +from .plugins.spark_lineage_agent.lineage_agent import SparkLineageAgent, create_spark_lineage_agent, get_plugin_info as get_spark_plugin_info + +__version__ = "0.1.0" + +__all__ = [ + 'FrameworkAgent', + 'AgentResult', + 'write_lineage_log', + 'dump_json_record', + 'read_json_records', + 'clear_json_file', + 'get_file_stats', + 'LogTracer', + 'log_trace_id', + 'SqlLineageAgent', + 'create_sql_lineage_agent', + 'get_sql_plugin_info', + 'PythonLineageAgent', + 'create_python_lineage_agent', + 'get_python_plugin_info', + 'AirflowLineageAgent', + 'create_airflow_lineage_agent', + 'get_airflow_plugin_info', + 'JavaLineageAgent', + 'create_java_lineage_agent', + 'get_java_plugin_info', + 'SparkLineageAgent', + 'create_spark_lineage_agent', + 'get_spark_plugin_info' +] \ No newline at end of file diff --git a/lf_algorithm/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fc241e2e90c9688f8bea2052d650d2b1b7fac0e Binary files /dev/null and b/lf_algorithm/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/__pycache__/agent_manager.cpython-313.pyc b/lf_algorithm/__pycache__/agent_manager.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df77828dd95fde1c46bf21102bcdd47765be857b Binary files /dev/null and b/lf_algorithm/__pycache__/agent_manager.cpython-313.pyc differ diff --git 
a/lf_algorithm/__pycache__/framework_agent.cpython-313.pyc b/lf_algorithm/__pycache__/framework_agent.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56c7d413957ef30bed50c261a1591cc96a6f8a53 Binary files /dev/null and b/lf_algorithm/__pycache__/framework_agent.cpython-313.pyc differ diff --git a/lf_algorithm/agent_manager.py b/lf_algorithm/agent_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..a563af1f056b8a5fbea5d27b2e0d76d30bd5c873 --- /dev/null +++ b/lf_algorithm/agent_manager.py @@ -0,0 +1,84 @@ +import importlib.metadata +from typing import Dict, Any, Optional, Type, Callable + +from .utils import get_logger, get_model, validate_api_keys + +logger = get_logger(__name__) + + +class AgentManager: + """Manages plugin discovery and loading for the FrameworkAgent""" + + def __init__(self): + self.agents: Dict[str, Dict[str, Any]] = {} + self.agent_factories: Dict[str, Callable] = {} + self._load_plugins() + # Validate API keys on initialization + validate_api_keys() + + def _load_plugins(self): + """Load all available agents plugins using entry points""" + try: + # Load plugins from the 'lineagentic.lf_algorithm.plugins' entry point group + for entry_point in importlib.metadata.entry_points(group='lineagentic.lf_algorithm.plugins'): + try: + agent_info = entry_point.load() + if callable(agent_info): + # If it's a function, assume it returns plugin info + agent_data = agent_info() + else: + # If it's already a dict/object + agent_data = agent_info + + agent_name = agent_data.get('name', entry_point.name) + self.agents[agent_name] = agent_data + + # Store the factory function if available + if 'factory_function' in agent_data: + self.agent_factories[agent_name] = agent_data['factory_function'] + + logger.info(f"Loaded plugin: {agent_name}") + + except Exception as e: + logger.error(f"Failed to load plugin {entry_point.name}: {e}") + + except Exception as e: + logger.error(f"Error loading plugins: {e}") + + def get_agent(self, agent_name: str) -> Optional[Dict[str, Any]]: + """Get agent information by name""" + return self.agents.get(agent_name) + + def list_agents(self) -> Dict[str, Dict[str, Any]]: + """List all available agents""" + return self.agents.copy() + + def create_agent(self, agent_name: str, **kwargs) -> Any: + """Create an agent instance using the agent's factory function""" + if agent_name not in self.agent_factories: + raise ValueError(f"Agent '{agent_name}' not found or has no factory function") + + factory = self.agent_factories[agent_name] + # Pass the get_model function to the agent factory + kwargs['get_model_func'] = get_model + return factory(agent_name=agent_name, **kwargs) + + def get_supported_operations(self) -> Dict[str, list]: + """Get all supported operations from all agents""" + operations = {} + for agent_name, agent_info in self.agents.items(): + supported_ops = agent_info.get('supported_operations', []) + for op in supported_ops: + if op not in operations: + operations[op] = [] + operations[op].append(agent_name) + return operations + + def get_agents_for_operation(self, operation: str) -> list: + """Get all agents that support a specific operation""" + supported_ops = self.get_supported_operations() + return supported_ops.get(operation, []) + + +# Global agent manager instance +agent_manager = AgentManager() \ No newline at end of file diff --git a/lf_algorithm/framework_agent.py b/lf_algorithm/framework_agent.py new file mode 100644 index 
0000000000000000000000000000000000000000..6ac3a4571d1e3b929274fc1e4bd31d1d3c28a929 --- /dev/null +++ b/lf_algorithm/framework_agent.py @@ -0,0 +1,130 @@ +import asyncio +import sys +import os +from typing import Dict, Any, List, Optional, Union +import json +from datetime import datetime +import uuid + +from .utils import get_logger, get_model, validate_api_keys + +logger = get_logger(__name__) + +from .utils.tracers import LogTracer +from .agent_manager import agent_manager +from agents import add_trace_processor +from .models.models import AgentResult + + +class FrameworkAgent: + + def __init__(self, agent_name: str, model_name: str = "gpt-4o-mini", + source_code: str = None): + """ + Initialize the Agent Framework. + + Args: + agent_name (str): The name of the agent to use + model_name (str): The model to use for the agents (default: "gpt-4o-mini") + lineage_config (LineageConfig): Configuration for OpenLineage event metadata + + Raises: + ValueError: If lineage_config is not provided + """ + if not source_code: + raise ValueError("source_code is required and cannot be None") + + self.agent_name = agent_name + self.model_name = model_name + self.source_code = source_code + self.agent_manager = agent_manager + + # Validate API keys on initialization + validate_api_keys() + + logger.info(f"FrameworkAgent initialized: agent_name={agent_name}, model_name={model_name}") + + + + async def run_agent_plugin(self, **kwargs) -> Dict[str, Any]: + """ + Run a specific agent with a source code. + + Args: + **kwargs: Additional arguments to pass to the agent + + Returns: + Dict[str, Any]: The results from the agent with merged OpenLineage metadata + """ + logger.info(f"Starting agent: {self.agent_name} with model: {self.model_name}") + add_trace_processor(LogTracer()) + + try: + # Create the agent using the plugin's factory function + logger.info(f"Creating agent instance for: {self.agent_name}") + agent = self.agent_manager.create_agent( + agent_name=self.agent_name, + source_code=self.source_code, + model_name=self.model_name, + **kwargs + ) + + # Run the agent + logger.info(f"Running agent: {self.agent_name}") + results = await agent.run() + logger.info(f"Agent {self.agent_name} completed successfully") + + return results + + except Exception as e: + logger.error(f"Error running agent {self.agent_name}: {e}") + return {"error": str(e)} + + def map_results_to_objects(self, results: Dict[str, Any]) -> Union[AgentResult, Dict[str, Any]]: + """ + Map JSON results from agent to structured AgentResult objects. + + Args: + results: Dictionary containing the agent results + + Returns: + AgentResult: Structured object representation of the results, or original dict if mapping fails + """ + try: + + # Check if it's an error response + if "error" in results: + return results + + # Check if it has the expected structure for lineage results + if "inputs" in results and "outputs" in results: + return AgentResult.from_dict(results) + + # If it doesn't match the expected structure, return as-is + return results + + except Exception as e: + logger.error(f"Error mapping results to objects: {e}") + return results + + async def run_agent(self, **kwargs) -> Union[AgentResult, Dict[str, Any]]: + """ + Run a specific agent and return structured objects instead of raw dictionaries. 
+ + Args: + **kwargs: Additional arguments to pass to the agent + + Returns: + Union[AgentResult, Dict[str, Any]]: Structured AgentResult object or error dict + """ + logger.info(f"Starting run_agent for {self.agent_name}") + raw_results = await self.run_agent_plugin(**kwargs) + mapped_results = self.map_results_to_objects(raw_results) + logger.info(f"Agent {self.agent_name} completed. Results type: {type(mapped_results)}") + if hasattr(mapped_results, 'to_dict'): + logger.info(f"Mapped results: {mapped_results.to_dict()}") + else: + logger.info(f"Raw results: {mapped_results}") + return mapped_results + + diff --git a/lf_algorithm/models/__pycache__/models.cpython-313.pyc b/lf_algorithm/models/__pycache__/models.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27361769ee84281d5469e35f3eb6b8beeb394e0e Binary files /dev/null and b/lf_algorithm/models/__pycache__/models.cpython-313.pyc differ diff --git a/lf_algorithm/models/models.py b/lf_algorithm/models/models.py new file mode 100644 index 0000000000000000000000000000000000000000..51b0022d9a885cb4b09a9f3cb3dcfa3a3a4d7906 --- /dev/null +++ b/lf_algorithm/models/models.py @@ -0,0 +1,285 @@ +""" +Agent result models for mapping JSON responses from lineage agents. + +This module contains classes for representing the structured results returned +by lineage analysis agents in a type-safe manner. +""" + +from typing import Dict, Any, List, Optional + + +class SchemaField: + """Schema field configuration for agent results""" + + def __init__(self, name: str, type: str, description: str): + self.name = name + self.type = type + self.description = description + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'SchemaField': + """Create SchemaField from dictionary""" + return cls( + name=data.get('name', ''), + type=data.get('type', ''), + description=data.get('description', '') + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + 'name': self.name, + 'type': self.type, + 'description': self.description + } + + +class Schema: + """Schema configuration for agent results""" + + def __init__(self, fields: List[SchemaField]): + self.fields = fields + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'Schema': + """Create Schema from dictionary""" + fields = [SchemaField.from_dict(field) for field in data.get('fields', [])] + return cls(fields=fields) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + 'fields': [field.to_dict() for field in self.fields] + } + + +class Transformation: + """Transformation configuration for column lineage""" + + def __init__(self, type: str, subtype: str, description: str, masking: bool = False): + self.type = type + self.subtype = subtype + self.description = description + self.masking = masking + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'Transformation': + """Create Transformation from dictionary""" + return cls( + type=data.get('type', ''), + subtype=data.get('subtype', ''), + description=data.get('description', ''), + masking=data.get('masking', False) + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + 'type': self.type, + 'subtype': self.subtype, + 'description': self.description, + 'masking': self.masking + } + + +class InputField: + """Input field configuration for column lineage""" + + def __init__(self, namespace: str, name: str, field: str, + transformations: List[Transformation]): + self.namespace = namespace + self.name = name + 
self.field = field + self.transformations = transformations + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'InputField': + """Create InputField from dictionary""" + transformations = [Transformation.from_dict(t) for t in data.get('transformations', [])] + return cls( + namespace=data.get('namespace', ''), + name=data.get('name', ''), + field=data.get('field', ''), + transformations=transformations + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + 'namespace': self.namespace, + 'name': self.name, + 'field': self.field, + 'transformations': [t.to_dict() for t in self.transformations] + } + + +class ColumnLineageField: + """Column lineage field configuration""" + + def __init__(self, input_fields: List[InputField]): + self.input_fields = input_fields + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'ColumnLineageField': + """Create ColumnLineageField from dictionary""" + input_fields = [InputField.from_dict(field) for field in data.get('inputFields', [])] + return cls(input_fields=input_fields) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + 'inputFields': [field.to_dict() for field in self.input_fields] + } + + +class ColumnLineage: + """Column lineage configuration""" + + def __init__(self, fields: Dict[str, ColumnLineageField]): + self.fields = fields + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'ColumnLineage': + """Create ColumnLineage from dictionary""" + fields = { + field_name: ColumnLineageField.from_dict(field_data) + for field_name, field_data in data.get('fields', {}).items() + } + return cls(fields=fields) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + 'fields': { + field_name: field_data.to_dict() + for field_name, field_data in self.fields.items() + } + } + + +class InputFacets: + """Input facets configuration for agent results""" + + def __init__(self, schema: Optional[Schema] = None): + self.schema = schema + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'InputFacets': + """Create InputFacets from dictionary""" + schema = Schema.from_dict(data.get('schema', {})) if data.get('schema') else None + return cls(schema=schema) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + result = {} + if self.schema: + result['schema'] = self.schema.to_dict() + return result + + +class Input: + """Input configuration for agent results""" + + def __init__(self, namespace: str, name: str, facets: Optional[InputFacets] = None): + self.namespace = namespace + self.name = name + self.facets = facets + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'Input': + """Create Input from dictionary""" + facets = InputFacets.from_dict(data.get('facets', {})) if data.get('facets') else None + return cls( + namespace=data.get('namespace', ''), + name=data.get('name', ''), + facets=facets + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + result = { + 'namespace': self.namespace, + 'name': self.name + } + if self.facets: + result['facets'] = self.facets.to_dict() + return result + + +class OutputFacets: + """Output facets configuration for agent results""" + + def __init__(self, column_lineage: Optional[ColumnLineage] = None): + self.column_lineage = column_lineage + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'OutputFacets': + """Create OutputFacets from dictionary""" + column_lineage = ColumnLineage.from_dict(data.get('columnLineage', {})) if 
data.get('columnLineage') else None + return cls(column_lineage=column_lineage) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + result = {} + if self.column_lineage: + result['columnLineage'] = self.column_lineage.to_dict() + return result + + +class Output: + """Output configuration for agent results""" + + def __init__(self, namespace: str, name: str, facets: Optional[OutputFacets] = None): + self.namespace = namespace + self.name = name + self.facets = facets + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'Output': + """Create Output from dictionary""" + facets = OutputFacets.from_dict(data.get('facets', {})) if data.get('facets') else None + return cls( + namespace=data.get('namespace', ''), + name=data.get('name', ''), + facets=facets + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + result = { + 'namespace': self.namespace, + 'name': self.name + } + if self.facets: + result['facets'] = self.facets.to_dict() + return result + + +class AgentResult: + """Main result class for agent lineage analysis""" + + def __init__(self, inputs: List[Input], outputs: List[Output]): + self.inputs = inputs + self.outputs = outputs + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> 'AgentResult': + """Create AgentResult from dictionary""" + inputs = [Input.from_dict(input_data) for input_data in data.get('inputs', [])] + outputs = [Output.from_dict(output_data) for output_data in data.get('outputs', [])] + return cls(inputs=inputs, outputs=outputs) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + 'inputs': [input_obj.to_dict() for input_obj in self.inputs], + 'outputs': [output_obj.to_dict() for output_obj in self.outputs] + } + + def __str__(self) -> str: + """String representation""" + return f"AgentResult(inputs={len(self.inputs)}, outputs={len(self.outputs)})" + + def __repr__(self) -> str: + """Detailed string representation""" + return f"AgentResult(inputs={self.inputs}, outputs={self.outputs})" diff --git a/lf_algorithm/plugins/__init__.py b/lf_algorithm/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7f4a7c394440c468a42c53b2da52fc16eb9f4a1b --- /dev/null +++ b/lf_algorithm/plugins/__init__.py @@ -0,0 +1 @@ +# Plugin system for FrameworkAgent \ No newline at end of file diff --git a/lf_algorithm/plugins/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7607fa9783ab22f367cd28279634192d3604b535 Binary files /dev/null and b/lf_algorithm/plugins/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/airflow_lineage_agent/__init__.py b/lf_algorithm/plugins/airflow_lineage_agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/lf_algorithm/plugins/airflow_lineage_agent/__init__.py @@ -0,0 +1 @@ + diff --git a/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af74006d9e6300b33a554951358566a1da1a6aee Binary files /dev/null and b/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/airflow_instructions.cpython-313.pyc 
b/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/airflow_instructions.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f2c31116701875027d47d23bb12a8252d32852d Binary files /dev/null and b/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/airflow_instructions.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc b/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6eda189235bf2dbd3452aeb354e7a66cebaa8cc4 Binary files /dev/null and b/lf_algorithm/plugins/airflow_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/airflow_lineage_agent/airflow_instructions.py b/lf_algorithm/plugins/airflow_lineage_agent/airflow_instructions.py new file mode 100644 index 0000000000000000000000000000000000000000..d2ad43a3452887ac23e4cfdf63a26df2c705d961 --- /dev/null +++ b/lf_algorithm/plugins/airflow_lineage_agent/airflow_instructions.py @@ -0,0 +1,98 @@ +def comprehensive_analysis_instructions(name: str): + return f""" + You are the {name} Airflow lineage analysis agent. + + **Your Task:** Perform complete Airflow DAG lineage analysis in a single comprehensive process. + + **Complete Analysis Process:** + + **Step 1: Syntax Analysis** + 1. Call the airflow_lineage_syntax_analysis() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze the Airflow DAG structure + 3. Store the syntax analysis results for use in subsequent steps + + **Step 2: Field Derivation** + 1. Call the airflow_lineage_field_derivation() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis + 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations + 4. Store the field derivation results + + **Step 3: Operation Tracing** + 1. Call the airflow_lineage_operation_tracing() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your operation analysis + 3. Follow the MCP tool instructions exactly to analyze logical operations and operators + 4. Store the operation tracing results + + **Step 4: Event Composition** + 1. Call the airflow_lineage_event_composer() MCP tool to get expert instructions + 2. Combine all previous analysis results (syntax, field derivation, operation tracing) + 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event + 4. Return the complete OpenLineage event + + **Important Guidelines:** + - Each MCP tool contains detailed instructions, examples, and output format requirements + - Follow the MCP tool instructions precisely for each step + - Maintain context between steps - use results from earlier steps to inform later analysis + - Ensure the final output is a complete, properly formatted OpenLineage event + - If any step fails, provide clear error information and stop the process + + **Workflow Summary:** + Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output + """ + +# Keep the individual instructions for backward compatibility if needed +def syntax_analysis_instructions(name: str): + return f""" + You are the {name} Airflow lineage analysis agent. + + **Your Task:** Analyze the provided Airflow DAG for syntax structure. + + **Process:** + 1. Call the airflow_lineage_syntax_analysis() MCP tool to get expert instructions + 2. 
Follow those instructions exactly to analyze the Airflow DAG + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def field_derivation_instructions(name: str): + return f""" + You are the {name} Airflow lineage analysis agent. + + **Your Task:** Analyze field mappings and transformations in the Airflow DAG. + + **Process:** + 1. Call the airflow_lineage_field_derivation() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze field mappings + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def operation_tracing_instructions(name: str): + return f""" + You are the {name} Airflow lineage analysis agent. + + **Your Task:** Analyze logical operations and operators in the Airflow DAG. + + **Process:** + 1. Call the airflow_lineage_operation_tracing() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze logical operations + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def event_composer_instructions(name: str): + return f""" + You are the {name} Airflow lineage analysis agent. + + **Your Task:** Compose OpenLineage events from the provided analysis data. + + **Process:** + 1. Call the airflow_lineage_event_composer() MCP tool to get expert instructions + 2. Follow those instructions exactly to compose the OpenLineage event + 3. Return the event in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. 
+ """ diff --git a/lf_algorithm/plugins/airflow_lineage_agent/lineage_agent.py b/lf_algorithm/plugins/airflow_lineage_agent/lineage_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..edd2832e91c24e092fa136159163fae983345a42 --- /dev/null +++ b/lf_algorithm/plugins/airflow_lineage_agent/lineage_agent.py @@ -0,0 +1,98 @@ +import os +import sys +import logging +from contextlib import AsyncExitStack +from agents import Agent, Tool, Runner, trace +from agents.mcp.server import MCPServerStdio +from typing import Dict, Any, Optional + +from ...utils.tracers import log_trace_id +from ...plugins.airflow_lineage_agent.airflow_instructions import comprehensive_analysis_instructions +from ...plugins.airflow_lineage_agent.mcp_servers.mcp_params import airflow_mcp_server_params +from ...utils.file_utils import dump_json_record + +# Get logger for this module +logger = logging.getLogger(__name__) + +MAX_TURNS = 30 # Increased for comprehensive analysis + + +class AirflowLineageAgent: + """Plugin agent for Airflow lineage analysis""" + + def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None): + self.agent_name = agent_name + self.model_name = model_name + self.source_code = source_code + self.get_model_func = get_model_func + + async def create_agent(self, airflow_mcp_servers) -> Agent: + # Use the passed get_model_func or fall back to the centralized one + if self.get_model_func: + model = self.get_model_func(self.model_name) + else: + from ...utils import get_model + model = get_model(self.model_name) + + agent = Agent( + name=self.agent_name, + instructions=comprehensive_analysis_instructions(self.agent_name), + model=model, + mcp_servers=airflow_mcp_servers, + ) + return agent + + async def run_agent(self, airflow_mcp_servers, source_code: str): + # Create single agent for comprehensive analysis + comprehensive_agent = await self.create_agent(airflow_mcp_servers) + + # Run the complete analysis in one go + result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS) + + # Return the final output + return dump_json_record(self.agent_name, result.final_output) + + async def run_with_mcp_servers(self, source_code: str): + async with AsyncExitStack() as stack: + airflow_mcp_servers = [ + await stack.enter_async_context( + MCPServerStdio(params, client_session_timeout_seconds=120) + ) + for params in airflow_mcp_server_params + ] + return await self.run_agent(airflow_mcp_servers, source_code=source_code) + + async def run_with_trace(self, source_code: str): + trace_name = f"{self.agent_name}-lineage-agent" + trace_id = log_trace_id(f"{self.agent_name.lower()}") + with trace(trace_name, trace_id=trace_id): + return await self.run_with_mcp_servers(source_code=source_code) + + async def run(self): + try: + logger.info(f"Starting Airflow lineage analysis for {self.agent_name}") + result = await self.run_with_trace(self.source_code) + logger.info(f"Completed Airflow lineage analysis for {self.agent_name}") + return result + except Exception as e: + logger.error(f"Error running {self.agent_name}: {e}") + return {"error": str(e)} + + +# Plugin interface functions +def create_airflow_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> AirflowLineageAgent: + """Factory function to create a AirflowLineageAgent instance""" + return AirflowLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func) + + +def 
get_plugin_info() -> Dict[str, Any]: + """Return plugin metadata""" + return { + "name": "airflow-lineage-agent", + "description": "Airflow lineage analysis agent for parsing and analyzing Airflow queries", + "version": "1.0.0", + "author": "Ali Shamsaddinlou", + "agent_class": AirflowLineageAgent, + "factory_function": create_airflow_lineage_agent, + "supported_operations": ["lineage_analysis"], + } \ No newline at end of file diff --git a/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__init__.py b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bdb4efc334bbd2a5b196dd4f64f9e6e85b4ba6e Binary files /dev/null and b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b70dfd03ef5268b60012a6f66c826f24237fd5d3 Binary files /dev/null and b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/__init__.py b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py new file mode 100644 index 0000000000000000000000000000000000000000..d2743215c28f9c103070bf41beaf55ff855cbb2c --- /dev/null +++ b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py @@ -0,0 +1,55 @@ +import logging + +# Configure logging to suppress verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger('mcp').setLevel(logging.WARNING) +logging.getLogger('mcp.server').setLevel(logging.WARNING) + +from mcp.server.fastmcp import FastMCP +from typing import Dict, Any + +mcp = FastMCP("lineage_airflow_server") + +from templates import (airflow_lineage_syntax_analysis as syntax_analysis_template, + airflow_lineage_field_derivation as field_derivation_template, + airflow_lineage_operation_tracing as operation_tracing_template, + airflow_lineage_event_composer as event_composer_template) + +@mcp.tool() +async def airflow_lineage_syntax_analysis() -> Dict[str, Any]: + """Airflow lineage structure and syntax decomposition expert""" + return { + "instructions": syntax_analysis_template(), + "version": "1.0.0", + "capabilities": ["dag_parsing", "task_extraction", "dependency_analysis"] + } + +@mcp.tool() +async def airflow_lineage_field_derivation() -> Dict[str, Any]: + """Field mapping and field derivation expert""" + return { + "instructions": field_derivation_template(), + "version": "1.0.0", + "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"] + } + 
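# The operation-tracing and event-composer tools below follow the same pattern: each returns its prompt template plus version/capability metadata for the calling agent.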
+@mcp.tool() +async def airflow_lineage_operation_tracing() -> Dict[str, Any]: + """Logical operator analysis and operation tracing expert""" + return { + "instructions": operation_tracing_template(), + "version": "1.0.0", + "capabilities": ["filter_analysis", "join_analysis", "aggregation_tracking"] + } + +@mcp.tool() +async def airflow_lineage_event_composer() -> Dict[str, Any]: + """Event composition and aggregation expert""" + return { + "instructions": event_composer_template(), + "version": "1.0.0", + "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"] + } + +if __name__ == "__main__": + mcp.run(transport='stdio') diff --git a/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/templates.py b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/templates.py new file mode 100644 index 0000000000000000000000000000000000000000..4503390c840927e69ac0b98acedc1d7eb24f5073 --- /dev/null +++ b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/templates.py @@ -0,0 +1,777 @@ +from datetime import datetime + + +def airflow_lineage_syntax_analysis(): + return """ + You are an Airflow DAG decomposition expert. Your task is to parse an Airflow DAG Python file and extract a clean breakdown of each task as logical units, including key operators, dependencies, and parameters. + + Instructions: + - Extract complete Airflow tasks (not individual lines). + - Include task_id, operator name, and any important arguments (e.g., sql, bash_command, python_callable). + - Identify upstream/downstream task relationships. + - Do NOT include imports, default_args, or DAG definitions unless they affect task behavior directly. + - For TaskGroups or dynamic mapping, expand each logical unit clearly. + + Output Format (JSON): + { + "tasks": [ + { + "task_id": "", + "operator": "", + "params": { + "key1": "value1", + ... + }, + "upstream": ["", ""], + "downstream": [""] + }, + ... 
+ ] + } + + --- + + Positive Example 1: Basic Bash DAG + + Input: + from airflow import DAG + from airflow.operators.bash import BashOperator + + with DAG('sample_dag') as dag: + t1 = BashOperator(task_id='start', bash_command='echo "start"') + t2 = BashOperator(task_id='process', bash_command='python run_job.py') + t3 = BashOperator(task_id='end', bash_command='echo "done"') + t1 >> t2 >> t3 + + Expected Output: + { + "tasks": [ + { + "task_id": "start", + "operator": "BashOperator", + "params": { "bash_command": "echo \"start\"" }, + "upstream": [], + "downstream": ["process"] + }, + { + "task_id": "process", + "operator": "BashOperator", + "params": { "bash_command": "python run_job.py" }, + "upstream": ["start"], + "downstream": ["end"] + }, + { + "task_id": "end", + "operator": "BashOperator", + "params": { "bash_command": "echo \"done\"" }, + "upstream": ["process"], + "downstream": [] + } + ] + } + + --- + + Positive Example 2: PythonOperator DAG + + Input: + from airflow import DAG + from airflow.operators.python import PythonOperator + + def fetch_data(): + return "data" + + def transform_data(): + return "transformed" + + with DAG('etl_dag') as dag: + extract = PythonOperator(task_id='extract', python_callable=fetch_data) + transform = PythonOperator(task_id='transform', python_callable=transform_data) + extract >> transform + + Expected Output: + { + "tasks": [ + { + "task_id": "extract", + "operator": "PythonOperator", + "params": { "python_callable": "fetch_data" }, + "upstream": [], + "downstream": ["transform"] + }, + { + "task_id": "transform", + "operator": "PythonOperator", + "params": { "python_callable": "transform_data" }, + "upstream": ["extract"], + "downstream": [] + } + ] + } + + --- + + Positive Example 3: Branching with BranchPythonOperator + + Input: + from airflow import DAG + from airflow.operators.python import PythonOperator, BranchPythonOperator + from airflow.operators.dummy import DummyOperator + + def choose_path(): + return "path_a" + + with DAG('branch_dag') as dag: + start = DummyOperator(task_id='start') + branch = BranchPythonOperator(task_id='branch', python_callable=choose_path) + path_a = DummyOperator(task_id='path_a') + path_b = DummyOperator(task_id='path_b') + end = DummyOperator(task_id='end') + + start >> branch >> [path_a, path_b] + [path_a, path_b] >> end + + Expected Output: + { + "tasks": [ + { + "task_id": "start", + "operator": "DummyOperator", + "params": {}, + "upstream": [], + "downstream": ["branch"] + }, + { + "task_id": "branch", + "operator": "BranchPythonOperator", + "params": { "python_callable": "choose_path" }, + "upstream": ["start"], + "downstream": ["path_a", "path_b"] + }, + { + "task_id": "path_a", + "operator": "DummyOperator", + "params": {}, + "upstream": ["branch"], + "downstream": ["end"] + }, + { + "task_id": "path_b", + "operator": "DummyOperator", + "params": {}, + "upstream": ["branch"], + "downstream": ["end"] + }, + { + "task_id": "end", + "operator": "DummyOperator", + "params": {}, + "upstream": ["path_a", "path_b"], + "downstream": [] + } + ] + } + + --- + + Positive Example 4: TaskGroup + + Input: + from airflow import DAG + from airflow.operators.dummy import DummyOperator + from airflow.utils.task_group import TaskGroup + + with DAG('grouped_dag') as dag: + start = DummyOperator(task_id='start') + end = DummyOperator(task_id='end') + + with TaskGroup('transformations') as tg: + t1 = DummyOperator(task_id='clean') + t2 = DummyOperator(task_id='enrich') + t1 >> t2 + + start >> tg >> end + + Expected 
Output: + { + "tasks": [ + { + "task_id": "start", + "operator": "DummyOperator", + "params": {}, + "upstream": [], + "downstream": ["transformations.clean"] + }, + { + "task_id": "transformations.clean", + "operator": "DummyOperator", + "params": {}, + "upstream": ["start"], + "downstream": ["transformations.enrich"] + }, + { + "task_id": "transformations.enrich", + "operator": "DummyOperator", + "params": {}, + "upstream": ["transformations.clean"], + "downstream": ["end"] + }, + { + "task_id": "end", + "operator": "DummyOperator", + "params": {}, + "upstream": ["transformations.enrich"], + "downstream": [] + } + ] + } + + --- + + Positive Example 5: Dynamic Task Mapping with expand() + + Input: + from airflow import DAG + from airflow.operators.python import PythonOperator + + def greet(name): + print(f"Hello {name}") + + with DAG('dynamic_dag') as dag: + greet_task = PythonOperator.partial( + task_id='greet', + python_callable=greet + ).expand(op_args=[["Alice", "Bob", "Charlie"]]) + + Expected Output: + { + "tasks": [ + { + "task_id": "greet", + "operator": "PythonOperator.expand", + "params": { + "python_callable": "greet", + "op_args": ["Alice", "Bob", "Charlie"] + }, + "upstream": [], + "downstream": [] + } + ] + } + + --- + + Negative Example 1: + + Input: + from airflow import DAG + from airflow.operators.python import PythonOperator + + def fetch(): + return "data" + + with DAG('bad_dag') as dag: + task = PythonOperator(task_id='fetch', python_callable=fetch) + + Incorrect Output: + { + "fetch": "PythonOperator" + } + + Reason: + - The structure is invalid: + - It lacks required `"tasks"` array. + - It omits the `"params"` block. + - It does not specify upstream/downstream relationships. + """ + + + + + +def airflow_lineage_field_derivation(): + return """ + You are an Airflow task field mapping analysis expert. Your task is to analyze each task in an Airflow DAG and determine: + + 1. What input data or fields it depends on. + 2. What transformations it performs. + 3. What output data or fields it produces. + + Instructions: + - Focus on operators like BashOperator, PythonOperator, SQL-related operators, etc. + - Do NOT analyze Airflow scheduling logic or metadata unless it affects lineage. + - For PythonOperators, infer logic from the function if possible. + - For SQL or BashOperators, parse the SQL or script if included. + - Your job is to extract lineage-relevant inputs, transformations, and outputs. + - look into all the operators and their parameters, and infer the inputs, outputs, and transformations. + - if the operator is a PythonOperator, look into the function and infer the inputs, outputs, and transformations. + - if the operator is a SQLOperator, look into the SQL and infer the inputs, outputs, and transformations. + - if the operator is a BashOperator, look into the Bash command and infer the inputs, outputs, and transformations. + - if the operator is a PostgresOperator, look into the SQL and infer the inputs, outputs, and transformations. + - if the operator is a MySQLOperator, look into the SQL and infer the inputs, outputs, and transformations. + - if the operator is a OracleOperator, look into the SQL and infer the inputs, outputs, and transformations. + - if the operator is a SparkOperator, look into the Spark code and infer the inputs, outputs, and transformations. + - if the operator is a HiveOperator, look into the Hive code and infer the inputs, outputs, and transformations. 
+ - if the operator is a KafkaOperator, look into the Kafka code and infer the inputs, outputs, and transformations. + - if the operator is a S3Operator, look into the S3 code and infer the inputs, outputs, and transformations. + - if the operator is a GCSOperator, look into the GCS code and infer the inputs, outputs, and transformations. + - if the operator is a FTPOperator, look into the FTP code and infer the inputs, outputs, and transformations. + - if the operator is a SFTPOperator, look into the SFTP code and infer the inputs, outputs, and transformations. + + Output Format: + [ + { "output_fields": [ { + "namespace": "", + "name": "", + "field": "", + "transformation": "" + } ] }, + ... + ] + + + + Positive Example : + + Input: + from airflow import DAG + from airflow.operators.python import PythonOperator + from datetime import datetime + import pandas as pd + import numpy as np + import shutil + + def fetch_raw_data(): + # Simulate a data pull or raw copy + shutil.copy('/data/source/raw_customers.csv', '/data/input/customers.csv') + + def transform_customer_data(): + df = pd.read_csv('/data/input/customers.csv') + + df['first_name'] = df['first_name'].str.strip().str.title() + df['last_name'] = df['last_name'].str.strip().str.title() + df['full_name'] = df['first_name'] + ' ' + df['last_name'] + + df['birthdate'] = pd.to_datetime(df['birthdate']) + df['age'] = (pd.Timestamp('today') - df['birthdate']).dt.days // 365 + + df['age_group'] = np.where(df['age'] >= 60, 'Senior', + np.where(df['age'] >= 30, 'Adult', 'Young')) + + df = df[df['email'].notnull()] + + df.to_csv('/data/output/cleaned_customers.csv', index=False) + + def load_to_warehouse(): + # Load cleaned data to customers_1 table in database + df = pd.read_csv('/data/output/cleaned_customers.csv') + + # Get database connection + pg_hook = PostgresHook(postgres_conn_id='warehouse_connection') + engine = pg_hook.get_sqlalchemy_engine() + + # Write to customers_1 table + df.to_sql('customers_1', engine, if_exists='replace', index=False) + + print(f"Successfully loaded {len(df)} records to customers_1 table") + + default_args = { + 'start_date': datetime(2025, 8, 1), + } + + with DAG( + dag_id='customer_etl_pipeline_extended', + default_args=default_args, + schedule_interval='@daily', + catchup=False, + tags=['etl', 'example'] + ) as dag: + + ff = PythonOperator( + task_id='fetch_data', + python_callable=fetch_raw_data + ) + + tt = PythonOperator( + task_id='transform_and_clean', + python_callable=transform_customer_data + ) + + ll = PythonOperator( + task_id='load_to_warehouse', + python_callable=load_to_warehouse + ) + + ff >> tt >> ll + + Expected Output: + { + "output_fields": [ + { + "namespace": "default", + "name": "customers.csv", + "field": "first_name", + "transformation": "Strip and title case" + }, + { + "namespace": "default", + "name": "customers.csv", + "field": "last_name", + "transformation": "Strip and title case" + }, + { + "namespace": "default", + "name": "customers.csv", + "field": "full_name", + "transformation": "Concatenation with space" + }, + { + "namespace": "default", + "name": "customers.csv", + "field": "birthdate", + "transformation": "Convert to datetime" + }, + { + "namespace": "default", + "name": "customers.csv", + "field": "age", + "transformation": "Calculate age" + }, + { + "namespace": "default", + "name": "customers.csv", + "field": "age_group", + "transformation": "Group by age" + }, + { + "namespace": "default", + "name": "customers.csv", + "field": "email", + "transformation": 
"Remove nulls" + } + + ], + } + + + + """ + + + +def airflow_lineage_operation_tracing(): + return """ + You are a logical operator analysis expert for Airflow DAGs. Your task is to inspect each task’s logic and extract the logical operations applied to data fields. This includes: + + - Filters + - Joins (if any SQL is embedded or implied) + - Group by / Having + - Order by + - Other conditional logic (e.g., CASE, EXISTS, .apply filters) + + Instructions: + - Only include fields involved in logic, not all fields. + - Tasks using Python callables or SQL should be parsed and analyzed. + - Bash commands are only considered if they invoke Python/SQL/CLI logic that performs data filtering or selection. + + Output Format: + { + "logical_operators": [ + { + "task_id": "", + "source_fields": ["", "", ...], + "logical_operators": { + "filters": ["..."], + "joins": ["..."], + "group_by": ["..."], + "having": ["..."], + "order_by": ["..."], + "other": ["..."] + } + } + ] + } + + --- + + Positive Example 1: + + Input: + from airflow.operators.postgres_operator import PostgresOperator + + t1 = PostgresOperator( + task_id='filter_active_users', + sql='SELECT id, name FROM users WHERE status = \'active\' ORDER BY name', + postgres_conn_id='analytics_db' + ) + + Expected Output: + { + "logical_operators": [ + { + "task_id": "filter_active_users", + "source_fields": ["status", "name"], + "logical_operators": { + "filters": ["status = 'active'"], + "order_by": ["name"] + } + } + ] + } + + --- + + Positive Example 2: + + Input: + from airflow.operators.python import PythonOperator + + def filter_sales(): + import pandas as pd + df = pd.read_csv("sales.csv") + filtered = df[df["region"] == "EU"] + result = filtered[filtered["amount"] > 1000] + return result + + t2 = PythonOperator( + task_id='filter_sales', + python_callable=filter_sales + ) + + Expected Output: + { + "logical_operators": [ + { + "task_id": "filter_sales", + "source_fields": ["region", "amount"], + "logical_operators": { + "filters": ["df['region'] == 'EU'", "filtered['amount'] > 1000"] + } + } + ] + } + + --- + + Negative Example 1: + + Input: + from airflow.operators.bash import BashOperator + + t3 = BashOperator( + task_id='run_model', + bash_command='python model.py' + ) + + Incorrect Output: + { + "logical_operators": [ + { + "task_id": "run_model", + "source_fields": ["model"], + "logical_operators": { + "filters": ["--use-gpu"] + } + } + ] + } + + Reason: + - BashOperator with a generic script path provides no visible logical operations on data. + - There is no SQL or Python code to analyze for filtering, joining, or grouping. + - No valid field-level logic can be inferred. + """ + + + + +def airflow_lineage_event_composer(): + return """ + You are an OpenLineage lineage generation expert for Apache Airflow DAGs. + + Your job is to take parsed DAG tasks, field mappings, and logical operations, and generate a **single OpenLineage event JSON** representing full lineage across the DAG. + + --- + + ### You will receive: + + 1. **DAG Task Breakdown** (with dependencies, task_ids, operator type, params) + + 2. **Field Mappings** per task: + [ + { + "task_id": "", + "inputs": [...], + "outputs": [...], + "transformations": [...] + } + ] + + 3. **Logical Operators** per task: + [ + { + "task_id": "", + "source_fields": [...], + "logical_operators": { + "filters": [...], + "joins": [...], + "group_by": [...], + "having": [...], + "order_by": [...], + "other": [...] 
+ } + } + ] + + --- + + ### Your Task: + + Generate **one OpenLineage event JSON** that captures the full end-to-end data flow and transformations in the DAG. + + Strictly follow the format below: + + - Do NOT rename, flatten, or restructure any fields or keys. + - Output only the final OpenLineage JSON — no extra text, comments, or explanation. + - `inputs` should represent input **datasets**, not individual fields. + 4. Based on following examples generate , , , for Apache Airflow DAGs and tasks (file-based sources/targets, SQL-based operators, cloud storage operators, in-memory variables): + + Airflow PythonOperator (reads local file) + def _read_file(): + with open("/data/raw/customers.csv") as f: + return f.read() + Expected: + or : default + or : file./data/raw/customers.csv + + Airflow PythonOperator (writes local file) + def _write_file(data): + with open("/data/curated/customers_curated.csv", "w") as f: + f.write(data) + Expected: + : default + : file./data/curated/customers_curated.csv + + Airflow BashOperator (reads S3 file) + bash_command="aws s3 cp s3://datalake/raw/events/2025-08-01.json -" + Expected: + or : default + or : s3./datalake/raw/events/2025-08-01.json + + Airflow BashOperator (writes S3 file) + bash_command="aws s3 cp /tmp/output.json s3://warehouse/gold/output.json" + Expected: + : default + : s3./warehouse/gold/output.json + + Airflow SQL operators (PostgresOperator with schema.table) + sql="SELECT * FROM analytics.orders" + Expected: + or : default + or : analytics.orders + + Airflow SQL operators (BigQueryOperator with project.dataset.table) + sql="SELECT id FROM project123.dataset456.customers" + Expected: + or : project123 + or : dataset456.customers + + Airflow S3ToRedshiftOperator + s3_bucket="datalake", s3_key="bronze/sales.csv", table="analytics.sales" + Expected: + : default + : s3./datalake/bronze/sales.csv + : default + : analytics.sales + + Airflow LocalFilesystemToGCSOperator + src="/tmp/data.json", dst="bronze/data.json" + Expected: + : default + : file./tmp/data.json + : default + : gs./bronze/data.json + + Airflow in-memory XCom variable + ti.xcom_push(key="intermediate_data", value=[1,2,3]) + Expected: + : temp + : intermediate_data + + Airflow XCom read + data = ti.xcom_pull(key="intermediate_data") + Expected: + : temp + : intermediate_data + + Notes: + - Use scheme prefixes for path-like sources/targets: + file./absolute/or/relative/path + s3./bucket/key + gs./bucket/key + abfs./container/path + - For in-memory XComs or Python variables, use: + = temp + = + - For SQL-based operators: + BigQuery: namespace = , name = + Postgres/MySQL: namespace = default, name = + SQL Server: namespace = , name = + - Wherever you can't find information for , , , , , , , , then write "NA". + - Very important: Your output must follow exactly the specified JSON structure — do not output explanations, comments, or anything else. + - very very very important: Your output must follow **exactly** this JSON structure — do not output explanations, comments, or anything else. 
+ + --- + + ### Required Output Format (Example): + { + "inputs": [ + { + "namespace": "", + "name": "", + "facets": { + "schema": { + "fields": [ + { + "name": "", + "type": "", + "description": "" + } + ] + } + } + } + ], + "outputs": [ + { + "namespace": "", + "name": "", + "facets": { + "columnLineage": { + "fields": { + "": { + "inputFields": [ + { + "namespace": "", + "name": "", + "field": "", + "transformations": [ + { + "type": "", + "subtype": "", + "description": "", + "masking": false + } + ] + } + ] + } + } + } + } + } + ] + } + + 4. Return only results in above mentioned json schema format. do not add any text. + """ \ No newline at end of file diff --git a/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_params.py b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_params.py new file mode 100644 index 0000000000000000000000000000000000000000..2dedfc38026c9bd4da6dd42e25d68c3de289ec42 --- /dev/null +++ b/lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_params.py @@ -0,0 +1,9 @@ +import os +from dotenv import load_dotenv + +load_dotenv(override=True) + +# python_lineage_agent mcp server params +airflow_mcp_server_params = [ + {"command": "python", "args": ["lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py"]}, +] diff --git a/lf_algorithm/plugins/java_lineage_agent/__init__.py b/lf_algorithm/plugins/java_lineage_agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/lf_algorithm/plugins/java_lineage_agent/__init__.py @@ -0,0 +1 @@ + diff --git a/lf_algorithm/plugins/java_lineage_agent/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/java_lineage_agent/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b4fd8eddbc4e838f7297aba2b6fdd589e046da0 Binary files /dev/null and b/lf_algorithm/plugins/java_lineage_agent/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/java_lineage_agent/__pycache__/java_instructions.cpython-313.pyc b/lf_algorithm/plugins/java_lineage_agent/__pycache__/java_instructions.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b2fc10783242fa582b28bca41a6c375c3eb7a8c Binary files /dev/null and b/lf_algorithm/plugins/java_lineage_agent/__pycache__/java_instructions.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/java_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc b/lf_algorithm/plugins/java_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5ee2a47c9484bfcee11cd4091287bfa84d5a70e Binary files /dev/null and b/lf_algorithm/plugins/java_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/java_lineage_agent/java_instructions.py b/lf_algorithm/plugins/java_lineage_agent/java_instructions.py new file mode 100644 index 0000000000000000000000000000000000000000..e58fe7d687c7d131b563258a573fc12ffff989a3 --- /dev/null +++ b/lf_algorithm/plugins/java_lineage_agent/java_instructions.py @@ -0,0 +1,98 @@ +def comprehensive_analysis_instructions(name: str): + return f""" + You are the {name} Java lineage analysis agent. + + **Your Task:** Perform complete Java code lineage analysis in a single comprehensive process. + + **Complete Analysis Process:** + + **Step 1: Syntax Analysis** + 1. 
Call the java_lineage_syntax_analysis() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze the Java code structure + 3. Store the syntax analysis results for use in subsequent steps + + **Step 2: Field Derivation** + 1. Call the java_lineage_field_derivation() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis + 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations + 4. Store the field derivation results + + **Step 3: Operation Tracing** + 1. Call the java_lineage_operation_tracing() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your operation analysis + 3. Follow the MCP tool instructions exactly to analyze logical operations and operators + 4. Store the operation tracing results + + **Step 4: Event Composition** + 1. Call the java_lineage_event_composer() MCP tool to get expert instructions + 2. Combine all previous analysis results (syntax, field derivation, operation tracing) + 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event + 4. Return the complete OpenLineage event + + **Important Guidelines:** + - Each MCP tool contains detailed instructions, examples, and output format requirements + - Follow the MCP tool instructions precisely for each step + - Maintain context between steps - use results from earlier steps to inform later analysis + - Ensure the final output is a complete, properly formatted OpenLineage event + - If any step fails, provide clear error information and stop the process + + **Workflow Summary:** + Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output + """ + +# Keep the individual instructions for backward compatibility if needed +def syntax_analysis_instructions(name: str): + return f""" + You are the {name} Java lineage analysis agent. + + **Your Task:** Analyze the provided Java code for syntax structure. + + **Process:** + 1. Call the java_lineage_syntax_analysis() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze the Java code + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def field_derivation_instructions(name: str): + return f""" + You are the {name} Java lineage analysis agent. + + **Your Task:** Analyze field mappings and transformations in the Java code. + + **Process:** + 1. Call the java_lineage_field_derivation() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze field mappings + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def operation_tracing_instructions(name: str): + return f""" + You are the {name} Java lineage analysis agent. + + **Your Task:** Analyze logical operations and operators in the Java code. + + **Process:** + 1. Call the java_lineage_operation_tracing() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze logical operations + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. 
+ """ + +def event_composer_instructions(name: str): + return f""" + You are the {name} Java lineage analysis agent. + + **Your Task:** Compose OpenLineage events from the provided analysis data. + + **Process:** + 1. Call the java_lineage_event_composer() MCP tool to get expert instructions + 2. Follow those instructions exactly to compose the OpenLineage event + 3. Return the event in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ diff --git a/lf_algorithm/plugins/java_lineage_agent/lineage_agent.py b/lf_algorithm/plugins/java_lineage_agent/lineage_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..78363e6c016f3c35d05df14031a29ce45fcb64d1 --- /dev/null +++ b/lf_algorithm/plugins/java_lineage_agent/lineage_agent.py @@ -0,0 +1,97 @@ +import os +import sys +import logging +from contextlib import AsyncExitStack +from agents import Agent, Tool, Runner, trace +from agents.mcp.server import MCPServerStdio +from typing import Dict, Any, Optional + +from ...utils.tracers import log_trace_id +from ...plugins.java_lineage_agent.java_instructions import comprehensive_analysis_instructions +from ...plugins.java_lineage_agent.mcp_servers.mcp_params import java_mcp_server_params +from ...utils.file_utils import dump_json_record + +# Get logger for this module +logger = logging.getLogger(__name__) + +MAX_TURNS = 30 # Increased for comprehensive analysis + + +class JavaLineageAgent: + """Plugin agent for Java lineage analysis""" + + def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None): + self.agent_name = agent_name + self.model_name = model_name + self.source_code = source_code + self.get_model_func = get_model_func + + async def create_agent(self, java_mcp_servers) -> Agent: + # Use the passed get_model_func or fall back to the centralized one + if self.get_model_func: + model = self.get_model_func(self.model_name) + else: + from ...utils import get_model + model = get_model(self.model_name) + + agent = Agent( + name=self.agent_name, + instructions=comprehensive_analysis_instructions(self.agent_name), + model=model, + mcp_servers=java_mcp_servers, + ) + return agent + + async def run_agent(self, java_mcp_servers, source_code: str): + # Create single agent for comprehensive analysis + comprehensive_agent = await self.create_agent(java_mcp_servers) + + # Run the complete analysis in one go + result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS) + + # Return the final output + return dump_json_record(self.agent_name, result.final_output) + + async def run_with_mcp_servers(self, source_code: str): + async with AsyncExitStack() as stack: + java_mcp_servers = [ + await stack.enter_async_context( + MCPServerStdio(params, client_session_timeout_seconds=120) + ) + for params in java_mcp_server_params + ] + return await self.run_agent(java_mcp_servers, source_code=source_code) + + async def run_with_trace(self, source_code: str): + trace_name = f"{self.agent_name}-lineage-agent" + trace_id = log_trace_id(f"{self.agent_name.lower()}") + with trace(trace_name, trace_id=trace_id): + return await self.run_with_mcp_servers(source_code=source_code) + + async def run(self): + try: + logger.info(f"Starting Java lineage analysis for {self.agent_name}") + result = await self.run_with_trace(self.source_code) + logger.info(f"Completed Java lineage analysis for {self.agent_name}") + 
return result + except Exception as e: + logger.error(f"Error running {self.agent_name}: {e}") + return {"error": str(e)} + + +# Plugin interface functions +def create_java_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> JavaLineageAgent: + """Factory function to create a JavaLineageAgent instance""" + return JavaLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func) + + +def get_plugin_info() -> Dict[str, Any]: + """Return plugin metadata""" + return { + "name": "java-lineage-agent", + "description": "Java lineage analysis agent for parsing and analyzing Java queries", + "version": "1.0.0", + "author": "Ali Shamsaddinlou", + "agent_class": JavaLineageAgent, + "factory_function": create_java_lineage_agent, + } \ No newline at end of file diff --git a/lf_algorithm/plugins/java_lineage_agent/mcp_servers/__init__.py b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..def0a9d62df1f55eeed35e59271626b8f578bd73 Binary files /dev/null and b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7bd963cf9a55ed9a3d3173984977ca828a76adc Binary files /dev/null and b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/__init__.py b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py new file mode 100644 index 0000000000000000000000000000000000000000..2a726807d821dc7b8d7cc1e63e825de118a94ea3 --- /dev/null +++ b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py @@ -0,0 +1,55 @@ +import logging + +# Configure logging to suppress verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger('mcp').setLevel(logging.WARNING) +logging.getLogger('mcp.server').setLevel(logging.WARNING) + +from mcp.server.fastmcp import FastMCP +from typing import Dict, Any + +mcp = FastMCP("lineage_java_server") + +from templates import (java_lineage_syntax_analysis as syntax_analysis_template, + java_lineage_field_derivation as field_derivation_template, + java_lineage_operation_tracing as operation_tracing_template, + java_lineage_event_composer as event_composer_template) + +@mcp.tool() +async def java_lineage_syntax_analysis() -> Dict[str, Any]: + """Java lineage structure and syntax decomposition expert""" + return { + "instructions": syntax_analysis_template(), + "version": "1.0.0", + "capabilities": 
["java_parsing", "method_extraction", "block_analysis"] + } + +@mcp.tool() +async def java_lineage_field_derivation() -> Dict[str, Any]: + """Field mapping and field derivation expert""" + return { + "instructions": field_derivation_template(), + "version": "1.0.0", + "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"] + } + +@mcp.tool() +async def java_lineage_operation_tracing() -> Dict[str, Any]: + """Logical operator analysis and operation tracing expert""" + return { + "instructions": operation_tracing_template(), + "version": "1.0.0", + "capabilities": ["filter_analysis", "stream_analysis", "aggregation_tracking"] + } + +@mcp.tool() +async def java_lineage_event_composer() -> Dict[str, Any]: + """Event composition and aggregation expert""" + return { + "instructions": event_composer_template(), + "version": "1.0.0", + "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"] + } + +if __name__ == "__main__": + mcp.run(transport='stdio') diff --git a/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/templates.py b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/templates.py new file mode 100644 index 0000000000000000000000000000000000000000..f8aab74816fade9125f17225d10050e076b75fe5 --- /dev/null +++ b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/templates.py @@ -0,0 +1,605 @@ +from datetime import datetime + + +def java_lineage_syntax_analysis(): + return """ + You are a Java data pipeline decomposition expert. Your task is to analyze complex Java source files and extract discrete, logical transformation blocks. These include data source initialization, filtering, transformation, aggregation, feature derivation, and any computation logic. Each extracted block should be meaningful, self-contained, and independently interpretable. + + Instructions: + - Extract: Complete transformation steps, including data source initialization, filtering, mapping, joining, grouping, calculating, or any pre/postprocessing blocks. + - Do NOT extract single lines unless they represent a standalone logical operation or setup (e.g., reading a file, defining a method, or a full map/filter chain). + - Group tightly related chained operations (e.g., Java Stream chains) into a single transformation unit. + - Preserve entire method definitions or reusable transformation blocks intact. + - Comment lines (// ...) can help guide naming but should not be extracted on their own. + + Output Format (JSON): + { + "sp1": { "name": "", "code": "" }, + "sp2": { "name": "", "code": "" }, + ... 
+ } + + --- + + Positive Example 1: + + Input Java: + import java.nio.file.*; + import java.util.*; + import java.util.stream.*; + + public class DataProcessor { + public static void main(String[] args) throws Exception { + // Load data + List lines = Files.readAllLines(Paths.get("sales.csv")); + + // Parse and clean data + List sales = lines.stream() + .skip(1) + .map(Sale::fromCsv) + .filter(s -> s.getPrice() != null) + .collect(Collectors.toList()); + + // Compute revenue + for (Sale s : sales) { + s.setRevenue(s.getPrice() * s.getQuantity()); + } + + // Filter high revenue + List highRevenue = sales.stream() + .filter(s -> s.getRevenue() > 1000) + .collect(Collectors.toList()); + } + } + + Expected Output: + { + "sp1": { + "name": "load_sales_data_from_csv", + "code": "List lines = Files.readAllLines(Paths.get(\"sales.csv\"));" + }, + "sp2": { + "name": "parse_and_clean_sales_data", + "code": "List sales = lines.stream()\n .skip(1)\n .map(Sale::fromCsv)\n .filter(s -> s.getPrice() != null)\n .collect(Collectors.toList());" + }, + "sp3": { + "name": "compute_revenue_per_sale", + "code": "for (Sale s : sales) {\n s.setRevenue(s.getPrice() * s.getQuantity());\n}" + }, + "sp4": { + "name": "filter_high_revenue_sales", + "code": "List highRevenue = sales.stream()\n .filter(s -> s.getRevenue() > 1000)\n .collect(Collectors.toList());" + } + } + + --- + + Positive Example 2 (with method definition): + + Input Java: + public static List normalize(List values) { + double mean = values.stream().mapToDouble(v -> v).average().orElse(0.0); + double std = Math.sqrt(values.stream().mapToDouble(v -> Math.pow(v - mean, 2)).average().orElse(0.0)); + return values.stream().map(v -> (v - mean) / std).collect(Collectors.toList()); + } + + // In main + List incomes = loadIncomeData(); // Assume loaded + List normalized = normalize(incomes); + + Expected Output: + { + "sp1": { + "name": "define_normalize_method", + "code": "public static List normalize(List values) {\n double mean = values.stream().mapToDouble(v -> v).average().orElse(0.0);\n double std = Math.sqrt(values.stream().mapToDouble(v -> Math.pow(v - mean, 2)).average().orElse(0.0));\n return values.stream().map(v -> (v - mean) / std).collect(Collectors.toList());\n}" + }, + "sp2": { + "name": "load_income_data", + "code": "List incomes = loadIncomeData();" + }, + "sp3": { + "name": "normalize_income_values", + "code": "List normalized = normalize(incomes);" + } + } + + --- + + Negative Example (Too granular): + + { + "sp1": { "name": "skip_header", "code": "lines.stream().skip(1)" }, + "sp2": { "name": "filter_null_price", "code": ".filter(s -> s.getPrice() != null)" } + } + + Reason: These operations are tightly chained and should be grouped into a cohesive transformation step. + """ + + + + +def java_lineage_field_derivation(): + return """ + You are a Java field mapping analysis expert. Given a Java code snippet (typically part of a data transformation pipeline), your job is to extract and explain how each output field or variable is derived. For each, identify: + + 1. The **source field(s)** or variables it depends on + 2. The **transformation logic** applied (e.g., arithmetic operation, aggregation, string manipulation, method call, etc.) + + Output Format: + { + "output_fields": [ + { + "namespace": "", + "name": "", + "field": "", + "transformation": "" + }, + ... 
+ ] + } + + --- + + Positive Example 1: + + Input Java: + read from table employee + Employee employee = new Employee(); + employee.setAnnualSalary(employee.getMonthlySalary() * 12); + + Expected Output: + { + "output_fields": [ + { + "namespace": "default", + "name": "employee", + "field": "monthlySalary", + "transformation": "Multiplied by 12" + } + ] + } + + --- + + Positive Example 2: + + Input Java: + user.setFullName(user.getFirstName().toUpperCase() + " " + user.getLastName()); + + Expected Output: + { + "output_fields": [ + { + "namespace": "default", + "name": "user", + "field": "firstName", + "transformation": "Concatenation with space; UPPER applied to first name" + }, + { + "namespace": "default", + "name": "user", + "field": "lastName", + "transformation": "Concatenation with space; UPPER applied to last name" + } + ] + } + + + + --- + + Negative Example 1 (Incorrect: Unstructured): + + { + "annualSalary": "employee.getMonthlySalary() * 12" + } + + Reason: This is a raw expression and doesn’t explain the transformation clearly or follow the expected schema. + + --- + + Negative Example 2 (Incorrect: Missing logic): + + Input Java: + invoice.setTax(invoice.getIncome() * 0.3); + + Incorrect Output: + { + "output_fields": [ + { + "name": "tax", + "source": "invoice.getIncome()", + "transformation": "Direct" + } + ] + } + + Reason: Transformation logic must describe that it was "Multiplied by 0.3", not just "Direct". + """ + + + +def java_lineage_operation_tracing(): + return """ + You are a Java logical operator analysis expert. Your task is to analyze Java code (typically using Streams, custom filter logic, or data transformation libraries) and extract all **logical operations** applied to data structures such as lists, maps, or custom data models, including: + + - Only list the fields involved in logical operations, not all fields. + - WHERE-like filters (e.g., `.filter()`, `if` conditions inside loops) + - JOIN conditions (e.g., matching fields from two objects) + - GROUP BY and aggregation keys (e.g., `.collect(groupingBy(...))`) + - Filtering after grouping (e.g., filtering a grouped map) + - Sorting operations (e.g., `.sorted(Comparator.comparing(...))`) + - Any logical expressions affecting element selection (e.g., `.anyMatch()`, `Predicate`, custom boolean-returning lambdas) + + Return the result in the following structured format: + + { + "output_fields": [ + { + "source_structure": "", + "source_fields": ["", "", "..."], + "logical_operators": { + "filters": [], + "joins": [], + "group_by": [], + "having": [], + "order_by": [], + "other": [] + } + } + ] + } + + - Only include entries for logical operators if the list is non-empty. + - Represent conditions and expressions fully and clearly. + - Normalize filters and joins (e.g., `e.getAge() > 18`, `emp.getDeptId() == dept.getId()`) + - Include all source collections involved and only the fields used in logical operations. 
+ + --- + + Positive Example 1: + + Input Java: + List filtered = employees.stream() + .filter(e -> e.getRegion().equals("US")) + .collect(Collectors.toList()); + + Map grouped = filtered.stream() + .collect(Collectors.groupingBy(Employee::getCustomerId, Collectors.summingDouble(Employee::getAmount))); + + Map result = grouped.entrySet().stream() + .filter(entry -> entry.getValue() > 1000) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + Expected Output: + { + "output_fields": [ + { + "source_structure": "employees", + "source_fields": ["region", "customerId", "amount"], + "logical_operators": { + "filters": ["e.getRegion().equals(\"US\")", "entry.getValue() > 1000"], + "group_by": ["Employee::getCustomerId"] + } + } + ] + } + + --- + + Positive Example 2: + + Input Java: + List merged = employees.stream() + .flatMap(emp -> departments.stream() + .filter(dept -> emp.getDeptId() == dept.getId()) + .map(dept -> new Merged(emp, dept))) + .collect(Collectors.toList()); + + List active = merged.stream() + .filter(m -> m.getStatus().equals("active")) + .sorted(Comparator.comparing(Merged::getName)) + .collect(Collectors.toList()); + + Expected Output: + { + "output_fields": [ + { + "source_structure": "employees", + "source_fields": ["deptId", "status", "name"], + "logical_operators": { + "joins": ["emp.getDeptId() == dept.getId()"], + "filters": ["m.getStatus().equals(\"active\")"], + "order_by": ["Merged::getName"] + } + }, + { + "source_structure": "departments", + "source_fields": ["id"], + "logical_operators": { + "joins": ["emp.getDeptId() == dept.getId()"] + } + } + ] + } + + --- + + Positive Example 3: + + Input Java: + List flagged = accounts.stream() + .peek(a -> a.setFlag(a.getStatus().equals("closed") ? 1 : 0)) + .collect(Collectors.toList()); + + Expected Output: + { + "output_fields": [ + { + "source_structure": "accounts", + "source_fields": ["status"], + "logical_operators": { + "other": ["a.getStatus().equals(\"closed\") ? 1 : 0"] + } + } + ] + } + + --- + + Negative Example 1 (Incorrect formatting): + + { + "filters": "e.getRegion().equals(\"US\")", + "group_by": "Employee::getCustomerId" + } + + Reason: This structure is flat and omits `source_structure`, `source_fields`, and required nesting under `output_fields`. + + --- + + Negative Example 2 (Missing logical clause): + + Input Java: + List result = users.stream() + .filter(u -> u.getAge() > 18) + .sorted(Comparator.comparing(User::getSignupDate)) + .collect(Collectors.toList()); + + Incorrect Output: + { + "output_fields": [ + { + "source_structure": "users", + "source_fields": ["age"], + "logical_operators": { + "filters": ["u.getAge() > 18"] + } + } + ] + } + + Reason: The `order_by` clause is missing. `signupDate` must be included in `source_fields` and in `order_by`. + """ + + + + +def java_lineage_event_composer(): + return """ + You are an OpenLineage lineage generation expert. + + Your job is to take the outputs from upstream Java data analysis agents and generate a **single, complete OpenLineage event JSON** representing end-to-end data lineage for the transformation pipeline. + + --- + + ### You will receive: + + 1. **Parsed Code Blocks** representing key transformation steps: + { + "sp1": { "name": "load_data", "code": "" }, + "sp2": { "name": "filter_data", "code": "" }, + "sp3": { "name": "compute_result", "code": "" } + } + + 2. 
**Field Mappings**: one per code block (same order), in this format: + [ + { + "output_fields": [ + { + "name": "", + "source": "", + "transformation": "" + } + ] + }, + ... + ] + + 3. **Logical Operators**: one per code block (same order), in this format: + [ + { + "output_fields": [ + { + "source_structure": "", + "source_fields": ["field1", "field2"], + "logical_operators": { + "filters": ["..."], + "joins": ["..."], + "group_by": ["..."], + "having": ["..."], + "order_by": ["..."], + "other": ["..."] + } + } + ] + }, + ... + ] + + --- + + ### Your Task: + + Generate **one event JSON** that captures the **entire pipeline** from raw source data to final derived outputs. + + Strictly follow the structure below and do not change field names or nesting. It is **very important** to keep the exact same format: + + - Use `"inputs"` and `"outputs"` as array keys (do NOT use `inputDataset` or `outputDataset`) + - Preserve `"facets"` blocks under `"job"`, `"inputs"`, and `"outputs"` + - Include `"columnLineage"` as a facet under `"outputs.facets"` (not at the top level) + - Maintain the exact field names: + - `"eventType"`, `"eventTime"`, `"run"`, `"job"`, `"inputs"`, `"outputs"`, `"facets"`, `"query"`, `"processingType"`, `"integration"`, etc. + 3. you show have all the fields mentioned in following json schema. + 4. Based on following examples generate , , , for Java code patterns (pure Java I/O, JDBC, Hibernate/JPA): + + Pure Java (read file via NIO) + List lines = java.nio.file.Files.readAllLines(java.nio.file.Paths.get("/data/raw/customers.csv")); + Expected: + or : default + or : file./data/raw/customers.csv + + Pure Java (write file) + java.nio.file.Files.write(java.nio.file.Paths.get("/data/curated/sales_curated.csv"), bytes); + Expected: + : default + : file./data/curated/sales_curated.csv + + In-memory collections/objects + List customers = new ArrayList<>(); + Expected: + or : temp + or : customers + + JDBC (PostgreSQL) with explicit schema.table + String sql = "SELECT * FROM analytics.orders"; + try (Connection c = DriverManager.getConnection("jdbc:postgresql://host:5432/db"); + Statement s = c.createStatement(); + ResultSet rs = s.executeQuery(sql)) + Expected: + or : default + or : analytics.orders + + JDBC (MySQL) database.table + String sql = "SELECT u.id, u.email FROM ecommerce.users u"; + try (Connection c = DriverManager.getConnection("jdbc:mysql://host:3306/shop"); + Statement s = c.createStatement(); + ResultSet rs = s.executeQuery(sql)) + Expected: + or : default + or : ecommerce.users + + JDBC (SQL Server) database.schema.table + String sql = "SELECT * FROM sales.dbo.orders"; + try (Connection c = DriverManager.getConnection("jdbc:sqlserver://host;databaseName=sales"); + Statement s = c.createStatement(); + ResultSet rs = s.executeQuery(sql)) + Expected: + or : sales + or : dbo.orders + + JDBC (Oracle) schema.table + String sql = "SELECT * FROM HR.EMPLOYEES"; + try (Connection c = DriverManager.getConnection("jdbc:oracle:thin:@//host:1521/ORCLPDB1"); + Statement s = c.createStatement(); + ResultSet rs = s.executeQuery(sql)) + Expected: + or : default + or : HR.EMPLOYEES + + Hibernate / JPA (Entity with schema) + @Entity + @Table(name = "orders", schema = "sales") + class Order { ... } + Expected: + or : default + or : sales.orders + + Hibernate / JPA (Entity without schema; default schema) + @Entity + @Table(name = "customers") + class Customer { ... 
} + Expected: + or : default + or : customers + + JDBC write (INSERT into schema.table) + String sql = "INSERT INTO analytics.daily_metrics (run_date, total) VALUES (?, ?)"; + Expected: + : default + : analytics.daily_metrics + + Notes: + - Use scheme prefixes for path-like sources/targets when present: + file./absolute/or/relative/path + s3./bucket/key + gs./bucket/key + abfs./container/path + - For in-memory variables/collections, use: + = temp + = + - For relational sources/targets referenced via SQL, prefer = . If a database/catalog prefix exists (e.g., SQL Server), map it to and keep = . Otherwise use = default. + - Wherever you can't find information for , , , , , , , , then write "NA". + - Very important: Your output must follow exactly the specified JSON structure — do not output explanations, comments, or anything else. + + + - wherever you cant find information for example for , , + , , , , + , , then just write "NA". + + - very very very important: Your output must follow **exactly** this JSON structure — do not output explanations, comments, or anything else. + --- + + ### Required Output Format (Example): + + { + "inputs": [ + { + "namespace": "", + "name": "", + "facets": { + "schema": { + "fields": [ + { + "name": "", + "type": "", + "description": "" + } + ] + } + } + } + ], + "outputs": [ + { + "namespace": "", + "name": "", + "facets": { + "columnLineage": { + "fields": { + "": { + "inputFields": [ + { + "namespace": "", + "name": "", + "field": "", + "transformations": [ + { + "type": "", + "subtype": "", + "description": "", + "masking": false + } + ] + } + ] + } + } + } + } + } + ] + } + + 5. Return only results in above mentioned json schema format. do not add any text. + """ diff --git a/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_params.py b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_params.py new file mode 100644 index 0000000000000000000000000000000000000000..f87ab3f9cf49d1209bc960ac7ce9c84b45b5a8b6 --- /dev/null +++ b/lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_params.py @@ -0,0 +1,9 @@ +import os +from dotenv import load_dotenv + +load_dotenv(override=True) + +# java_lineage_agent mcp server params +java_mcp_server_params = [ + {"command": "python", "args": ["lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py"]}, +] diff --git a/lf_algorithm/plugins/python_lineage_agent/__init__.py b/lf_algorithm/plugins/python_lineage_agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/lf_algorithm/plugins/python_lineage_agent/__init__.py @@ -0,0 +1 @@ + diff --git a/lf_algorithm/plugins/python_lineage_agent/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/python_lineage_agent/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79b4f6077177ae4ee9d1ea240b8f04a6d3f9f05f Binary files /dev/null and b/lf_algorithm/plugins/python_lineage_agent/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/python_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc b/lf_algorithm/plugins/python_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a813d9da675d166e82d4e3733f9ef87ce6516f0f Binary files /dev/null and b/lf_algorithm/plugins/python_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc differ diff --git 
a/lf_algorithm/plugins/python_lineage_agent/__pycache__/python_instructions.cpython-313.pyc b/lf_algorithm/plugins/python_lineage_agent/__pycache__/python_instructions.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa4ce85a1d851221f6be72c5df845b02c1d5490a Binary files /dev/null and b/lf_algorithm/plugins/python_lineage_agent/__pycache__/python_instructions.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/python_lineage_agent/lineage_agent.py b/lf_algorithm/plugins/python_lineage_agent/lineage_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..e58f39d5d2abca3dcbc7707ca44c7fb195465263 --- /dev/null +++ b/lf_algorithm/plugins/python_lineage_agent/lineage_agent.py @@ -0,0 +1,97 @@ +import os +import sys +import logging +from contextlib import AsyncExitStack +from agents import Agent, Tool, Runner, trace +from agents.mcp.server import MCPServerStdio +from typing import Dict, Any, Optional + +from ...utils.tracers import log_trace_id +from ...plugins.python_lineage_agent.python_instructions import comprehensive_analysis_instructions +from ...plugins.python_lineage_agent.mcp_servers.mcp_params import python_mcp_server_params +from ...utils.file_utils import dump_json_record + +# Get logger for this module +logger = logging.getLogger(__name__) + +MAX_TURNS = 30 # Increased for comprehensive analysis + + +class PythonLineageAgent: + """Plugin agent for Python lineage analysis""" + + def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None): + self.agent_name = agent_name + self.model_name = model_name + self.source_code = source_code + self.get_model_func = get_model_func + + async def create_agent(self, python_mcp_servers) -> Agent: + # Use the passed get_model_func or fall back to the centralized one + if self.get_model_func: + model = self.get_model_func(self.model_name) + else: + from ...utils import get_model + model = get_model(self.model_name) + + agent = Agent( + name=self.agent_name, + instructions=comprehensive_analysis_instructions(self.agent_name), + model=model, + mcp_servers=python_mcp_servers, + ) + return agent + + async def run_agent(self, python_mcp_servers, source_code: str): + # Create single agent for comprehensive analysis + comprehensive_agent = await self.create_agent(python_mcp_servers) + + # Run the complete analysis in one go + result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS) + + # Return the final output + return dump_json_record(self.agent_name, result.final_output) + + async def run_with_mcp_servers(self, source_code: str): + async with AsyncExitStack() as stack: + python_mcp_servers = [ + await stack.enter_async_context( + MCPServerStdio(params, client_session_timeout_seconds=120) + ) + for params in python_mcp_server_params + ] + return await self.run_agent(python_mcp_servers, source_code=source_code) + + async def run_with_trace(self, source_code: str): + trace_name = f"{self.agent_name}-lineage-agent" + trace_id = log_trace_id(f"{self.agent_name.lower()}") + with trace(trace_name, trace_id=trace_id): + return await self.run_with_mcp_servers(source_code=source_code) + + async def run(self): + try: + logger.info(f"Starting Python lineage analysis for {self.agent_name}") + result = await self.run_with_trace(self.source_code) + logger.info(f"Completed Python lineage analysis for {self.agent_name}") + return result + except Exception as e: + logger.error(f"Error running {self.agent_name}: {e}") + return {"error": 
str(e)} + + +# Plugin interface functions +def create_python_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> PythonLineageAgent: + """Factory function to create a PythonLineageAgent instance""" + return PythonLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func) + + +def get_plugin_info() -> Dict[str, Any]: + """Return plugin metadata""" + return { + "name": "python-lineage-agent", + "description": "Python lineage analysis agent for parsing and analyzing Python queries", + "version": "1.0.0", + "author": "Ali Shamsaddinlou", + "agent_class": PythonLineageAgent, + "factory_function": create_python_lineage_agent, + } \ No newline at end of file diff --git a/lf_algorithm/plugins/python_lineage_agent/mcp_servers/__init__.py b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/python_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..641a7ecfd0eb411b5f5d506a3da07e90b26a4dfd Binary files /dev/null and b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/python_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d7f33af7cbb6282fef413f4a8e69add34f86f20 Binary files /dev/null and b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_params.py b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_params.py new file mode 100644 index 0000000000000000000000000000000000000000..5c6b21a904a833853ea63ce9ab1d7db9934aa856 --- /dev/null +++ b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_params.py @@ -0,0 +1,9 @@ +import os +from dotenv import load_dotenv + +load_dotenv(override=True) + +# python_lineage_agent mcp server params +python_mcp_server_params = [ + {"command": "python", "args": ["lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/lineage_python_server.py"]}, +] diff --git a/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/__init__.py b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/lineage_python_server.py b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/lineage_python_server.py new file mode 100644 index 0000000000000000000000000000000000000000..5cabe22824e6a5a3e96b5c9e98d7e4dadf78eb22 --- /dev/null +++ b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/lineage_python_server.py @@ -0,0 +1,55 @@ +import logging + +# Configure logging to suppress verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger('mcp').setLevel(logging.WARNING) +logging.getLogger('mcp.server').setLevel(logging.WARNING) + +from mcp.server.fastmcp 
import FastMCP +from typing import Dict, Any + +mcp = FastMCP("lineage_python_server") + +from templates import (python_lineage_syntax_analysis as syntax_analysis_template, + python_lineage_field_derivation as field_derivation_template, + python_lineage_operation_tracing as operation_tracing_template, + python_lineage_event_composer as event_composer_template) + +@mcp.tool() +async def python_lineage_syntax_analysis() -> Dict[str, Any]: + """Python lineage structure and syntax decomposition expert""" + return { + "instructions": syntax_analysis_template(), + "version": "1.0.0", + "capabilities": ["python_parsing", "block_extraction", "transformation_analysis"] + } + +@mcp.tool() +async def python_lineage_field_derivation() -> Dict[str, Any]: + """Field mapping and field derivation expert""" + return { + "instructions": field_derivation_template(), + "version": "1.0.0", + "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"] + } + +@mcp.tool() +async def python_lineage_operation_tracing() -> Dict[str, Any]: + """Logical operator analysis and operation tracing expert""" + return { + "instructions": operation_tracing_template(), + "version": "1.0.0", + "capabilities": ["filter_analysis", "join_analysis", "aggregation_tracking"] + } + +@mcp.tool() +async def python_lineage_event_composer() -> Dict[str, Any]: + """Event composition and aggregation expert""" + return { + "instructions": event_composer_template(), + "version": "1.0.0", + "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"] + } + +if __name__ == "__main__": + mcp.run(transport='stdio') diff --git a/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/templates.py b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/templates.py new file mode 100644 index 0000000000000000000000000000000000000000..4feb7f00653e5f1f00c6bde660e0c569a5b9683a --- /dev/null +++ b/lf_algorithm/plugins/python_lineage_agent/mcp_servers/mcp_python_lineage/templates.py @@ -0,0 +1,575 @@ +from datetime import datetime + + +def python_lineage_syntax_analysis(): + return """ + You are a Python data pipeline decomposition expert. Your task is to analyze complex Python scripts and extract discrete, logical transformation steps. These include data loading, cleaning, reshaping, feature engineering, and any computation blocks. Each extracted block should be meaningful, self-contained, and independently interpretable. + + Instructions: + - Extract: complete transformation blocks, including data loading, filtering, joins, groupings, calculations, reshaping, or model-related preprocessing. + - Do NOT extract single lines unless they represent a standalone logical operation or configuration (e.g., reading a file, defining a function, or executing a grouped transformation). + - Group tightly related chained operations (e.g., Pandas method chains) into one unit. + - Preserve function definitions or reusable transformation blocks intact. + - Comment lines (# ...) can help guide naming but should not be extracted on their own. + + Output Format (JSON): + { + "sp1": { "name": "", "code": "" }, + "sp2": { "name": "", "code": "" }, + ... 
+ } + + --- + + Positive Example 1: + + Input Python: + import pandas as pd + + # Load data + df = pd.read_csv('sales.csv') + + # Clean data + df = df.dropna(subset=['price']) + df['price'] = df['price'].astype(float) + + # Add derived columns + df['revenue'] = df['price'] * df['quantity'] + + # Filter high revenue + high_rev = df[df['revenue'] > 1000] + + Expected Output: + { + "sp1": { + "name": "load_sales_data", + "code": "df = pd.read_csv('sales.csv')" + }, + "sp2": { + "name": "clean_missing_and_cast_price", + "code": "df = df.dropna(subset=['price'])\\ndf['price'] = df['price'].astype(float)" + }, + "sp3": { + "name": "add_revenue_column", + "code": "df['revenue'] = df['price'] * df['quantity']" + }, + "sp4": { + "name": "filter_high_revenue_rows", + "code": "high_rev = df[df['revenue'] > 1000]" + } + } + + --- + + Positive Example 2 (with function): + + Input Python: + def normalize_column(df, column): + mean = df[column].mean() + std = df[column].std() + df[column] = (df[column] - mean) / std + return df + + df = pd.read_csv("data.csv") + df = normalize_column(df, "income") + + Expected Output: + { + "sp1": { + "name": "define_normalize_column_function", + "code": "def normalize_column(df, column):\\n mean = df[column].mean()\\n std = df[column].std()\\n df[column] = (df[column] - mean) / std\\n return df" + }, + "sp2": { + "name": "load_data_csv", + "code": "df = pd.read_csv(\\"data.csv\\")" + }, + "sp3": { + "name": "apply_normalization_to_income", + "code": "df = normalize_column(df, \\"income\\")" + } + } + + --- + + Negative Example 1 (Incorrect: Too granular): + + { + "sp1": { "name": "dropna", "code": "df = df.dropna()" }, + "sp2": { "name": "astype_price", "code": "df['price'] = df['price'].astype(float)" } + } + + Reason: These should be grouped if they belong to a single transformation step (e.g., cleaning). + """ + + + + +def python_lineage_field_derivation(): + return """ + You are a Python field mapping analysis expert. Given a Python script or block (typically data transformation code), your job is to extract and explain how each output variable or DataFrame column is derived. For each, identify: + + 1. The **source column(s)** or variables it depends on + 2. The **transformation logic** applied (e.g., arithmetic operation, aggregation, string manipulation, function call, etc.) + + Output Format: + { + "output_fields": [ + { + "namespace": "", + "name": "", + "field": "", + "transformation": "" + }, + ... 
+    ]
+    }
+
+    ---
+
+    Positive Example 1:
+
+    Input Python:
+    df = pd.read_csv("monthly_salary.csv")
+    df['annual_salary'] = df['monthly_salary'] * 12
+
+    Expected Output:
+    {
+    "output_fields": [
+        {
+        "namespace": "default",
+        "name": "monthly_salary.csv",
+        "field": "monthly_salary",
+        "transformation": "Multiplied by 12"
+        }
+    ]
+    }
+
+    ---
+
+    Positive Example 2:
+
+    Input Python:
+    df = pd.read_csv("sales.csv")
+    df['total'] = df['price'] * df['quantity']
+    df['discounted'] = df['total'] * 0.9
+
+    Expected Output:
+    {
+    "output_fields": [
+        {
+        "namespace": "default",
+        "name": "sales.csv",
+        "field": "price",
+        "transformation": "Multiplied price by quantity"
+        },
+        {
+        "namespace": "default",
+        "name": "sales.csv",
+        "field": "quantity",
+        "transformation": "Multiplied price by quantity"
+        },
+        {
+        "namespace": "default",
+        "name": "sales.csv",
+        "field": "total",
+        "transformation": "Multiplied total by 0.9"
+        }
+    ]
+    }
+
+    ---
+
+    Negative Example 1 (Incorrect: Unstructured):
+
+    {
+    "annual_salary": "df['monthly_salary'] * 12"
+    }
+
+    Reason: This is a raw expression and doesn't explain the transformation clearly or follow the expected schema.
+
+    ---
+
+    Negative Example 2 (Incorrect: Missing logic):
+
+    Input Python:
+    df['tax'] = df['income'] * 0.3
+
+    Incorrect Output:
+    {
+    "output_fields": [
+        {
+        "name": "tax",
+        "source": "df['income']",
+        "transformation": "Direct"
+        }
+    ]
+    }
+
+    Reason: Transformation logic must describe that it was "Multiplied by 0.3", not just "Direct".
+    """
+
+
+def python_lineage_operation_tracing():
+    return """
+    You are a logical operator analysis expert. Your task is to analyze a Python script (typically using Pandas) and extract all **logical operations** applied to DataFrames and their fields, including:
+
+    - Only list the fields involved in logical operations, not all fields.
+    - WHERE-like filters (e.g., boolean indexing, `.query()`)
+    - JOINs or `.merge()` conditions
+    - GROUP BY and aggregation keys
+    - Filtering after groupby (`.filter()`, conditional aggregation)
+    - Sorting operations (`.sort_values()`)
+    - Any logical expressions affecting row selection (e.g., `.isin()`, `.apply()` returning booleans, `.where()`)
+
+    Return the result in the following structured format:
+
+    {
+    "output_fields": [
+        {
+        "source_dataframe": "",
+        "source_fields": ["", "", "..."],
+        "logical_operators": {
+            "filters": [],
+            "joins": [],
+            "group_by": [],
+            "having": [],
+            "order_by": [],
+            "other": []
+        }
+        }
+    ]
+    }
+
+    - Only include entries for logical operators if the list is non-empty.
+    - Represent conditions and expressions fully and clearly.
+    - Normalize filters and joins (e.g., `df['col'] > 100`, `df1['id'] == df2['id']`)
+    - Include all source DataFrames involved and only the fields used in logical operations.
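+
> Editor's note: the operation categories listed above correspond to a small set of pandas calls, so they can also be located mechanically. A rough `ast`-based sketch follows as an illustration only; the extraction in this diff is prompt-driven, not static analysis, and the function and mapping names are assumptions.

```python
import ast

# Illustrative mapping from pandas method names to the categories above.
PANDAS_OPS = {"merge": "joins", "groupby": "group_by", "sort_values": "order_by", "query": "filters"}

def find_logical_ops(source: str) -> list[tuple[str, str]]:
    """Return (category, call_text) pairs for the pandas calls named in PANDAS_OPS."""
    hits = []
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
            category = PANDAS_OPS.get(node.func.attr)
            if category:
                hits.append((category, ast.unparse(node)))  # ast.unparse requires Python 3.9+
    return hits

# find_logical_ops("df.merge(other, on='id').sort_values('name')")
# -> [('order_by', "df.merge(other, on='id').sort_values('name')"),
#     ('joins', "df.merge(other, on='id')")]
```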
+ + --- + + Positive Example 1: + + Input Python: + df = pd.read_csv("sales.csv") + filtered = df[df["region"] == "US"] + grouped = filtered.groupby("customer_id").agg({"amount": "sum"}) + result = grouped[grouped["amount"] > 1000] + + Expected Output: + { + "output_fields": [ + { + "source_dataframe": "df", + "source_fields": ["region", "customer_id", "amount"], + "logical_operators": { + "filters": ["df['region'] == 'US'", "grouped['amount'] > 1000"], + "group_by": ["customer_id"] + } + } + ] + } + + --- + + Positive Example 2: + + Input Python: + merged = pd.merge(employees, departments, left_on="dept_id", right_on="id") + active = merged[merged["status"] == "active"] + sorted_df = active.sort_values("name") + + Expected Output: + { + "output_fields": [ + { + "source_dataframe": "employees", + "source_fields": ["dept_id", "status", "name"], + "logical_operators": { + "joins": ["employees['dept_id'] == departments['id']"], + "filters": ["merged['status'] == 'active'"], + "order_by": ["name"] + } + }, + { + "source_dataframe": "departments", + "source_fields": ["id"], + "logical_operators": { + "joins": ["employees['dept_id'] == departments['id']"] + } + } + ] + } + + --- + + Positive Example 3: + + Input Python: + df = pd.read_csv("accounts.csv") + df["flag"] = df["status"].apply(lambda x: 1 if x == "closed" else 0) + + Expected Output: + { + "output_fields": [ + { + "source_dataframe": "df", + "source_fields": ["status"], + "logical_operators": { + "other": ["lambda x: 1 if x == 'closed' else 0"] + } + } + ] + } + + --- + + Negative Example 1 (Incorrect formatting): + + { + "filters": "df['region'] == 'US'", + "group_by": "customer_id" + } + + Reason: This structure is flat and omits `source_dataframe`, `source_fields`, and required list nesting under `output_fields`. + + --- + + Negative Example 2 (Missing logical clause): + + Input Python: + df = users[users["age"] > 18].sort_values("signup_date") + + Incorrect Output: + { + "output_fields": [ + { + "source_dataframe": "users", + "source_fields": ["age"], + "logical_operators": { + "filters": ["users['age'] > 18"] + } + } + ] + } + + Reason: The `order_by` clause is missing. `signup_date` must be included in `source_fields` and in `order_by`. + """ + + + + +def python_lineage_event_composer(): + return """ + You are an OpenLineage lineage generation expert. + + Your job is to take the outputs from upstream Python data analysis agents and generate a **single, complete OpenLineage event JSON** representing end-to-end data lineage for the transformation pipeline. + + --- + + ### You will receive: + + 1. **Parsed Code Blocks** representing key transformation steps: + { + "sp1": { "name": "load_data", "code": "" }, + "sp2": { "name": "filter_data", "code": "" }, + "sp3": { "name": "compute_result", "code": "" } + } + + 2. **Field Mappings**: one per code block (same order), in this format: + [ + { + "output_fields": [ + { + "name": "", + "source": "", + "transformation": "" + } + ] + }, + ... + ] + + 3. **Logical Operators**: one per code block (same order), in this format: + [ + { + "output_fields": [ + { + "source_dataframe": "", + "source_fields": ["field1", "field2"], + "logical_operators": { + "filters": ["..."], + "joins": ["..."], + "group_by": ["..."], + "having": ["..."], + "order_by": ["..."], + "other": ["..."] + } + } + ] + }, + ... + ] + + --- + + ### Your Task: + + Generate **one event JSON** that captures the **entire pipeline** from raw source data to final derived outputs. 
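+
> Editor's note: since the composer's core job is folding the per-block field mappings into the `columnLineage` facet, a small sketch of that fold may help. The function name, the `"default"` namespace fallback, and the `"NA"` placeholders are illustrative assumptions consistent with the template's own conventions, not code from this diff.

```python
def build_column_lineage(field_mappings: list[dict]) -> dict:
    """Fold upstream field-mapping blocks (shape shown above) into a columnLineage facet."""
    fields: dict = {}
    for block in field_mappings:
        for fm in block.get("output_fields", []):
            entry = fields.setdefault(fm.get("name", "NA"), {"inputFields": []})
            entry["inputFields"].append({
                "namespace": "default",              # assumed fallback namespace
                "name": fm.get("source", "NA"),      # upstream source dataset/column
                "field": fm.get("source", "NA"),
                "transformations": [{
                    "type": "NA",
                    "subtype": "NA",
                    "description": fm.get("transformation", "NA"),
                    "masking": False,
                }],
            })
    return {"columnLineage": {"fields": fields}}
```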
+ + Strictly follow the structure below and do not change field names or nesting. It is **very important** to keep the exact same format: + + - Use `"inputs"` and `"outputs"` as array keys (do NOT use `inputDataset` or `outputDataset`) + - Preserve `"facets"` blocks under `"job"`, `"inputs"`, and `"outputs"` + - Include `"columnLineage"` as a facet under `"outputs.facets"` (not at the top level) + - Maintain the exact field names: + - `"eventType"`, `"eventTime"`, `"run"`, `"job"`, `"inputs"`, `"outputs"`, `"facets"`, `"query"`, `"processingType"`, `"integration"`, etc. + 4. Based on following examples generate , , , for Python code patterns (pure Python, pandas, NumPy, SQLAlchemy): + + Pure Python (files via built-ins) + with open("/data/raw/customers.json") as f: data = json.load(f) + Expected: + or : default + or : file./data/raw/customers.json + + Pure Python (in-memory objects) + customers = [{"id": 1, "name": "A"}] + Expected: + or : temp + or : customers + + pandas: read_csv from local path + df = pd.read_csv("/data/raw/sales.csv") + Expected: + or : default + or : file./data/raw/sales.csv + + pandas: read_parquet from cloud (S3) + df = pd.read_parquet("s3://datalake/bronze/events/2025-08-01.parquet") + Expected: + or : default + or : s3./datalake/bronze/events/2025-08-01.parquet + + pandas: in-memory DataFrame (from dict/list) + df = pd.DataFrame([{"id":1,"total":9.5}]) + Expected: + or : temp + or : df + + pandas: read_sql via SQLAlchemy/Postgres + df = pd.read_sql("SELECT * FROM analytics.orders", con) + Expected: + or : default + or : analytics.orders + + NumPy: load from .npy + arr = np.load("/models/embeddings.npy") + Expected: + or : default + or : file./models/embeddings.npy + + NumPy: in-memory array + arr = np.arange(10) + Expected: + or : temp + or : arr + + SQLAlchemy Core: Postgres table reference + stmt = sa.select(sa.text("id"), sa.text("total")).select_from(sa.text("sales.orders")) + Expected: + or : default + or : sales.orders + + SQLAlchemy Core: SQLite file database + engine = sa.create_engine("sqlite:////tmp/app.db") + df = pd.read_sql("SELECT * FROM customers", engine) + Expected: + or : default + or : customers + + pandas: write to CSV (output) + df.to_csv("/data/curated/sales_curated.csv", index=False) + Expected: + : default + : file./data/curated/sales_curated.csv + + pandas: write to Parquet on S3 (output) + df.to_parquet("s3://warehouse/gold/orders/2025-08-01.parquet") + Expected: + : default + : s3./warehouse/gold/orders/2025-08-01.parquet + + pandas: to_sql into schema.table (output) + df.to_sql("daily_metrics", con, schema="analytics", if_exists="replace", index=False) + Expected: + : default + : analytics.daily_metrics + + Notes: + - Use scheme prefixes for path-like sources/targets: + file./absolute/or/relative/path + s3./bucket/key + gs./bucket/key + abfs./container/path + - For in-memory variables (pure Python, pandas, NumPy), use: + = temp + = + - When reading/writing via SQL (pandas.read_sql / to_sql / SQLAlchemy), prefer = if schema is present; otherwise = . + - Wherever you can't find information for , , , , , , , , then write "NA". + - Very important: Your output must follow exactly the specified JSON structure — do not output explanations, comments, or anything else. + - very very very important: Your output must follow **exactly** this JSON structure — do not output explanations, comments, or anything else. 
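+
> Editor's note: the naming rules above are mechanical enough to express in a few lines. This is a sketch of that mapping only; the helper and its heuristics are assumptions for illustration and are not part of the diff.

```python
def resolve_dataset(ref: str, in_memory: bool = False) -> tuple[str, str]:
    """Map a source/target reference to a (namespace, name) pair per the notes above."""
    if in_memory:                                    # plain variables, DataFrames, arrays
        return "temp", ref
    for scheme in ("s3://", "gs://", "abfs://"):
        if ref.startswith(scheme):                   # s3://bucket/key -> s3./bucket/key
            return "default", scheme[:-3] + "./" + ref[len(scheme):]
    if "/" in ref or ref.endswith((".csv", ".parquet", ".json", ".npy")):
        return "default", "file." + ref              # local or relative file path
    return "default", ref                            # schema.table or bare table name

# resolve_dataset("s3://datalake/bronze/events.parquet") -> ("default", "s3./datalake/bronze/events.parquet")
# resolve_dataset("analytics.orders")                    -> ("default", "analytics.orders")
# resolve_dataset("df", in_memory=True)                  -> ("temp", "df")
```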
+ --- + + ### Required Output Format (Example): + + { + "inputs": [ + { + "namespace": "", + "name": "", + "facets": { + "schema": { + "fields": [ + { + "name": "", + "type": "", + "description": "" + } + ] + } + } + } + ], + "outputs": [ + { + "namespace": "", + "name": "", + "facets": { + "columnLineage": { + "fields": { + "": { + "inputFields": [ + { + "namespace": "", + "name": "", + "field": "", + "transformations": [ + { + "type": "", + "subtype": "", + "description": "", + "masking": false + } + ] + } + ] + } + } + } + } + } + ] + } + + 6. Return only results in above mentioned json schema format. do not add any text.""" diff --git a/lf_algorithm/plugins/python_lineage_agent/python_instructions.py b/lf_algorithm/plugins/python_lineage_agent/python_instructions.py new file mode 100644 index 0000000000000000000000000000000000000000..dee2e0d328f22017780841ad87d5d70b1bc2fee7 --- /dev/null +++ b/lf_algorithm/plugins/python_lineage_agent/python_instructions.py @@ -0,0 +1,99 @@ +def comprehensive_analysis_instructions(name: str): + return f""" + You are the {name} Python lineage analysis agent. + + **Your Task:** Perform complete Python script lineage analysis in a single comprehensive process. + + **Complete Analysis Process:** + + **Step 1: Syntax Analysis** + 1. Call the python_lineage_syntax_analysis() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze the Python script structure + 3. Store the syntax analysis results for use in subsequent steps + + **Step 2: Field Derivation** + 1. Call the python_lineage_field_derivation() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis + 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations + 4. Store the field derivation results + + **Step 3: Operation Tracing** + 1. Call the python_lineage_operation_tracing() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your operation analysis + 3. Follow the MCP tool instructions exactly to analyze logical operations and operators + 4. Store the operation tracing results + + **Step 4: Event Composition** + 1. Call the python_lineage_event_composer() MCP tool to get expert instructions + 2. Combine all previous analysis results (syntax, field derivation, operation tracing) + 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event + 4. Return the complete OpenLineage event + + **Important Guidelines:** + - Each MCP tool contains detailed instructions, examples, and output format requirements + - Follow the MCP tool instructions precisely for each step + - Maintain context between steps - use results from earlier steps to inform later analysis + - Ensure the final output is a complete, properly formatted OpenLineage event + - If any step fails, provide clear error information and stop the process + + **Workflow Summary:** + Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output + """ + +# Keep the individual instructions for backward compatibility if needed +def syntax_analysis_instructions(name: str): + return f""" + You are the {name} Python lineage analysis agent. + + **Your Task:** Analyze the provided Python script for syntax structure. + + **Process:** + 1. Call the python_lineage_syntax_analysis() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze the Python script + 3. 
Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def field_derivation_instructions(name: str): + return f""" + You are the {name} Python lineage analysis agent. + + **Your Task:** Analyze field mappings and transformations in the Python script. + + **Process:** + 1. Call the python_lineage_field_derivation() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze field mappings + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def operation_tracing_instructions(name: str): + return f""" + You are the {name} Python lineage analysis agent. + + **Your Task:** Analyze logical operations and operators in the Python script. + + **Process:** + 1. Call the python_lineage_operation_tracing() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze logical operations + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def event_composer_instructions(name: str): + return f""" + You are the {name} Python lineage analysis agent. + + **Your Task:** Compose OpenLineage events from the provided analysis data. + + **Process:** + 1. Call the python_lineage_event_composer() MCP tool to get expert instructions + 2. Follow those instructions exactly to compose the OpenLineage event + 3. Return the event in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. 
+ """ + \ No newline at end of file diff --git a/lf_algorithm/plugins/spark_lineage_agent/__init__.py b/lf_algorithm/plugins/spark_lineage_agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/lf_algorithm/plugins/spark_lineage_agent/__init__.py @@ -0,0 +1 @@ + diff --git a/lf_algorithm/plugins/spark_lineage_agent/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/spark_lineage_agent/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5f128e79e2a356bb87a7c54a7603a96d752e014 Binary files /dev/null and b/lf_algorithm/plugins/spark_lineage_agent/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/spark_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc b/lf_algorithm/plugins/spark_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c50ca5785049a25c4f0d5e52e820db5d94a8552 Binary files /dev/null and b/lf_algorithm/plugins/spark_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/spark_lineage_agent/__pycache__/spark_instructions.cpython-313.pyc b/lf_algorithm/plugins/spark_lineage_agent/__pycache__/spark_instructions.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e37f67fc769f7bebaf2946f50bfb35f5377a19b5 Binary files /dev/null and b/lf_algorithm/plugins/spark_lineage_agent/__pycache__/spark_instructions.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/spark_lineage_agent/lineage_agent.py b/lf_algorithm/plugins/spark_lineage_agent/lineage_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..88dd43db7ac3eb7360b68eb69259559c8661331f --- /dev/null +++ b/lf_algorithm/plugins/spark_lineage_agent/lineage_agent.py @@ -0,0 +1,98 @@ +import os +import sys +import logging +from contextlib import AsyncExitStack +from agents import Agent, Tool, Runner, trace +from agents.mcp.server import MCPServerStdio +from typing import Dict, Any, Optional + +from ...utils.tracers import log_trace_id +from ...plugins.spark_lineage_agent.spark_instructions import comprehensive_analysis_instructions +from ...plugins.spark_lineage_agent.mcp_servers.mcp_params import spark_mcp_server_params +from ...utils.file_utils import dump_json_record + +# Get logger for this module +logger = logging.getLogger(__name__) + +MAX_TURNS = 30 # Increased for comprehensive analysis + + +class SparkLineageAgent: + """Plugin agent for Spark lineage analysis""" + + def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None): + self.agent_name = agent_name + self.model_name = model_name + self.source_code = source_code + self.get_model_func = get_model_func + + async def create_agent(self, spark_mcp_servers) -> Agent: + # Use the passed get_model_func or fall back to the centralized one + if self.get_model_func: + model = self.get_model_func(self.model_name) + else: + from ...utils import get_model + model = get_model(self.model_name) + + agent = Agent( + name=self.agent_name, + instructions=comprehensive_analysis_instructions(self.agent_name), + model=model, + mcp_servers=spark_mcp_servers, + ) + return agent + + async def run_agent(self, spark_mcp_servers, source_code: str): + # Create single agent for comprehensive analysis + comprehensive_agent = await self.create_agent(spark_mcp_servers) + + # Run the complete analysis in 
one go + result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS) + + # Return the final output + return dump_json_record(self.agent_name, result.final_output) + + async def run_with_mcp_servers(self, source_code: str): + async with AsyncExitStack() as stack: + spark_mcp_servers = [ + await stack.enter_async_context( + MCPServerStdio(params, client_session_timeout_seconds=120) + ) + for params in spark_mcp_server_params + ] + return await self.run_agent(spark_mcp_servers, source_code=source_code) + + async def run_with_trace(self, source_code: str): + trace_name = f"{self.agent_name}-lineage-agent" + trace_id = log_trace_id(f"{self.agent_name.lower()}") + with trace(trace_name, trace_id=trace_id): + return await self.run_with_mcp_servers(source_code=source_code) + + async def run(self): + try: + logger.info(f"Starting Spark lineage analysis for {self.agent_name}") + result = await self.run_with_trace(self.source_code) + logger.info(f"Completed Spark lineage analysis for {self.agent_name}") + return result + except Exception as e: + logger.error(f"Error running {self.agent_name}: {e}") + return {"error": str(e)} + + +# Plugin interface functions +def create_spark_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> SparkLineageAgent: + """Factory function to create a SparkLineageAgent instance""" + return SparkLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func) + + +def get_plugin_info() -> Dict[str, Any]: + """Return plugin metadata""" + return { + "name": "spark-lineage-agent", + "description": "Spark lineage analysis agent for parsing and analyzing Spark queries", + "version": "1.0.0", + "author": "Ali Shamsaddinlou", + "agent_class": SparkLineageAgent, + "factory_function": create_spark_lineage_agent, + "supported_operations": ["lineage_analysis"], + } \ No newline at end of file diff --git a/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/__init__.py b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbe396b1a2ccb668309067d2e5e4d0eaffdbfcbe Binary files /dev/null and b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1bb3aa10659151768701bc38c2309a5f31389f3 Binary files /dev/null and b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_params.py b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_params.py new file mode 100644 index 0000000000000000000000000000000000000000..dff6827f3565c9882e46ebf3c52c3cd32b3cf072 --- /dev/null +++ b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_params.py @@ -0,0 +1,9 @@ +import os +from dotenv import load_dotenv + +load_dotenv(override=True) + +# spark_lineage_agent mcp server params 
+spark_mcp_server_params = [ + {"command": "python", "args": ["lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/lineage_spark_server.py"]}, +] diff --git a/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/__init__.py b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/lineage_spark_server.py b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/lineage_spark_server.py new file mode 100644 index 0000000000000000000000000000000000000000..db4d365757bdfbf5e6d8ccda6624f70dffc8f72a --- /dev/null +++ b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/lineage_spark_server.py @@ -0,0 +1,55 @@ +import logging + +# Configure logging to suppress verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger('mcp').setLevel(logging.WARNING) +logging.getLogger('mcp.server').setLevel(logging.WARNING) + +from mcp.server.fastmcp import FastMCP +from typing import Dict, Any + +mcp = FastMCP("lineage_spark_server") + +from templates import (spark_lineage_syntax_analysis as syntax_analysis_template, + spark_lineage_field_derivation as field_derivation_template, + spark_lineage_operation_tracing as operation_tracing_template, + spark_lineage_event_composer as event_composer_template) + +@mcp.tool() +async def spark_lineage_syntax_analysis() -> Dict[str, Any]: + """Spark lineage structure and syntax decomposition expert""" + return { + "instructions": syntax_analysis_template(), + "version": "1.0.0", + "capabilities": ["spark_parsing", "block_extraction", "transformation_analysis"] + } + +@mcp.tool() +async def spark_lineage_field_derivation() -> Dict[str, Any]: + """Field mapping and field derivation expert""" + return { + "instructions": field_derivation_template(), + "version": "1.0.0", + "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"] + } + +@mcp.tool() +async def spark_lineage_operation_tracing() -> Dict[str, Any]: + """Logical operator analysis and operation tracing expert""" + return { + "instructions": operation_tracing_template(), + "version": "1.0.0", + "capabilities": ["filter_analysis", "join_analysis", "aggregation_tracking"] + } + +@mcp.tool() +async def spark_lineage_event_composer() -> Dict[str, Any]: + """Event composition and aggregation expert""" + return { + "instructions": event_composer_template(), + "version": "1.0.0", + "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"] + } + +if __name__ == "__main__": + mcp.run(transport='stdio') diff --git a/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/templates.py b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/templates.py new file mode 100644 index 0000000000000000000000000000000000000000..c6750023e80fa773247ec1c5d440a32d7bb8cce1 --- /dev/null +++ b/lf_algorithm/plugins/spark_lineage_agent/mcp_servers/mcp_spark_lineage/templates.py @@ -0,0 +1,501 @@ +from datetime import datetime + + +def spark_lineage_syntax_analysis(): + return """ + You are a Spark data pipeline decomposition expert. Your task is to analyze complex Spark scripts (in Java or Python) and extract discrete, logical transformation steps. These include data loading, cleaning, reshaping, feature engineering, and computation blocks. 
Each extracted block should be meaningful, self-contained, and independently interpretable. + + Instructions: + - Extract: complete transformation blocks, including data reading, filtering, joins, aggregations, calculations, reshaping, or model-related preprocessing. + - Do NOT extract single lines unless they represent a standalone logical operation or configuration (e.g., reading a file, defining a function, or executing a grouped transformation). + - Group tightly related chained operations (e.g., DataFrame transformations) into one unit. + - Preserve function definitions or reusable transformation blocks intact. + - Comment lines (// or #) can help guide naming but should not be extracted on their own. + + Output Format (JSON): + { + "sp1": { "name": "", "code": "" }, + "sp2": { "name": "", "code": "" }, + ... + } + + --- + + Positive Example 1 (Java Spark): + + Input: + ```java + // Load data + Dataset sales = spark.read().option("header", "true").csv("sales.csv"); + + // Clean data + sales = sales.na().drop(new String[]{"price"}) + .withColumn("price", sales.col("price").cast("double")); + + // Add revenue column + sales = sales.withColumn("revenue", sales.col("price").multiply(sales.col("quantity"))); + + // Filter high revenue + Dataset highRev = sales.filter(sales.col("revenue").gt(1000)); + ``` + --- + Expected Output: + { + "sp1": { + "name": "load_sales_data", + "code": "Dataset sales = spark.read().option(\"header\", \"true\").csv(\"sales.csv\");" + }, + "sp2": { + "name": "clean_missing_and_cast_price", + "code": "sales = sales.na().drop(new String[]{\"price\"})\n .withColumn(\"price\", sales.col(\"price\").cast(\"double\"));" + }, + "sp3": { + "name": "add_revenue_column", + "code": "sales = sales.withColumn(\"revenue\", sales.col(\"price\").multiply(sales.col(\"quantity\")));" + }, + "sp4": { + "name": "filter_high_revenue_rows", + "code": "Dataset highRev = sales.filter(sales.col(\"revenue\").gt(1000));" + } + } + + --- + + Positive Example 2 (with function): + + # Load data + df = spark.read.csv('sales.csv', header=True) + + # Clean data + df = df.dropna(subset=['price']) + df = df.withColumn('price', df['price'].cast('double')) + + # Add revenue column + df = df.withColumn('revenue', df['price'] * df['quantity']) + + # Filter high revenue + high_rev = df.filter(df['revenue'] > 1000) + + + Expected Output: + { + "sp1": { + "name": "load_sales_data", + "code": "df = spark.read.csv('sales.csv', header=True)" + }, + "sp2": { + "name": "clean_missing_and_cast_price", + "code": "df = df.dropna(subset=['price'])\ndf = df.withColumn('price', df['price'].cast('double'))" + }, + "sp3": { + "name": "add_revenue_column", + "code": "df = df.withColumn('revenue', df['price'] * df['quantity'])" + }, + "sp4": { + "name": "filter_high_revenue_rows", + "code": "high_rev = df.filter(df['revenue'] > 1000)" + } + } + + --- + + Negative Example 1 (Incorrect: Too granular): + + df = df.dropna(subset=['price']) + df = df.withColumn('price', df['price'].cast('double')) + + + Incorrect Output: + { + "sp1": { + "name": "drop_null_prices", + "code": "df = df.dropna(subset=['price'])" + }, + "sp2": { + "name": "cast_price_column", + "code": "df = df.withColumn('price', df['price'].cast('double'))" + } + } + + Reason: These two lines belong to the same logical transformation step (data cleaning), and should be grouped into one block. Correct behavior would group them under a single sp key. 
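+
> Editor's note: once blocks are named and ordered, the `sp1..spN` envelope described above is purely mechanical. A tiny hypothetical helper, not part of this change, makes the shape concrete.

```python
def to_sp_blocks(blocks: list[tuple[str, str]]) -> dict:
    """Turn ordered (name, code) pairs into the {"sp1": {...}, "sp2": {...}} shape above."""
    return {f"sp{i}": {"name": name, "code": code}
            for i, (name, code) in enumerate(blocks, start=1)}

# to_sp_blocks([("load_sales_data", "df = spark.read.csv('sales.csv', header=True)")])
# -> {"sp1": {"name": "load_sales_data", "code": "df = spark.read.csv('sales.csv', header=True)"}}
```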
+ """ + + + + +def spark_lineage_field_derivation(): + return """ + You are a PySpark field mapping analysis expert. Given a PySpark script or block (typically data transformation code using `withColumn`, `select`, or similar), your job is to extract and explain how each output column is derived. For each, identify: + + 1. The **source column(s)** it depends on + 2. The **transformation logic** applied (e.g., arithmetic operation, aggregation, string manipulation, function call, etc.) + + { + "output_fields": [ + { + "namespace": "", + "name": "", + "field": "", + "transformation": "" + }, + ... + ] + } + + --- + + Positive Example 1: + + Input PySpark: + df = spark.read.csv("monthly_salary.csv", header=True) + df = df.withColumn("annual_salary", col("monthly_salary") * 12) + + Expected Output: + { + "output_fields": [ + { + "namespace": "default", + "name": "monthly_salary.csv", + "field": "monthly_salary", + "transformation": "Multiplied by 12" + } + ] + } + + --- + + Negative Example 1 (Incorrect: Unstructured): + + { + "annual_salary": "col('monthly_salary') * 12" + } + + Reason: This is a raw expression and doesn’t explain the transformation clearly or follow the expected schema. + + --- + + Negative Example 2 (Incorrect: Missing logic): + + Input PySpark: + df = spark.read.csv("income.csv", header=True) + df = df.withColumn("tax", col("income") * 0.3) + + Incorrect Output: + { + "output_fields": [ + { + "namespace": "default", + "name": "income.csv", + "field": "income", + "transformation": "Direct" + } + ] + } + + Reason: Transformation logic must describe that it was "Multiplied by 0.3", not just "Direct". + """ + + + +def spark_lineage_operation_tracing(): + return """ + You are a logical operator analysis expert. Your task is to analyze a PySpark script and extract all **logical operations** applied to DataFrames and their fields, including: + + - Only list the fields involved in logical operations, not all fields. + - WHERE-like filters (e.g., `.filter()`, `.where()`) + - JOIN conditions (`.join()` with `on`, `how`) + - GROUP BY and aggregation keys + - Filtering after groupBy (`.filter()`, conditional aggregation) + - Sorting operations (`.orderBy()`) + - Any logical expressions affecting row selection (e.g., `.isin()`, `.when()`, `.udf()` returning booleans) + + Return the result in the following structured format: + + { + "output_fields": [ + { + "source_dataframe": "", + "source_fields": ["", "", "..."], + "logical_operators": { + "filters": [], + "joins": [], + "group_by": [], + "having": [], + "order_by": [], + "other": [] + } + } + ] + } + + - Only include entries for logical operators if the list is non-empty. + - Represent conditions and expressions fully and clearly. + - Normalize filters and joins (e.g., `df['col'] > 100`, `df1['id'] == df2['id']`) + - Include all source DataFrames involved and only the fields used in logical operations. 
+ + --- + + Positive Example 1: + + Input PySpark: + df = spark.read.csv("sales.csv", header=True, inferSchema=True) + filtered = df.filter(col("region") == "US") + grouped = filtered.groupBy("customer_id").agg(sum("amount").alias("total")) + result = grouped.filter(col("total") > 1000) + + Expected Output: + { + "output_fields": [ + { + "source_dataframe": "df", + "source_fields": ["region", "customer_id", "amount"], + "logical_operators": { + "filters": ["df['region'] == 'US'", "grouped['total'] > 1000"], + "group_by": ["customer_id"] + } + } + ] + } + + --- + + Positive Example 2: + + Input PySpark: + merged = employees.join(departments, employees.dept_id == departments.id, "inner") + active = merged.filter(col("status") == "active") + sorted_df = active.orderBy("name") + + Expected Output: + { + "output_fields": [ + { + "source_dataframe": "employees", + "source_fields": ["dept_id", "status", "name"], + "logical_operators": { + "joins": ["employees['dept_id'] == departments['id']"], + "filters": ["merged['status'] == 'active'"], + "order_by": ["name"] + } + }, + { + "source_dataframe": "departments", + "source_fields": ["id"], + "logical_operators": { + "joins": ["employees['dept_id'] == departments['id']"] + } + } + ] + } + + --- + + Positive Example 3: + + Input PySpark: + df = spark.read.csv("accounts.csv", header=True) + df = df.withColumn("flag", when(col("status") == "closed", 1).otherwise(0)) + + Expected Output: + { + "output_fields": [ + { + "source_dataframe": "df", + "source_fields": ["status"], + "logical_operators": { + "other": ["when(df['status'] == 'closed', 1).otherwise(0)"] + } + } + ] + } + + --- + + Negative Example 1 (Incorrect formatting): + + { + "filters": "df['region'] == 'US'", + "group_by": "customer_id" + } + + Reason: This structure is flat and omits `source_dataframe`, `source_fields`, and required list nesting under `output_fields`. + + --- + + Negative Example 2 (Missing logical clause): + + Input PySpark: + df = users.filter(col("age") > 18).orderBy("signup_date") + + Incorrect Output: + { + "output_fields": [ + { + "source_dataframe": "users", + "source_fields": ["age"], + "logical_operators": { + "filters": ["users['age'] > 18"] + } + } + ] + } + + Reason: The `order_by` clause is missing. `signup_date` must be included in `source_fields` and in `order_by`. + """ + + + + +def spark_lineage_event_composer(): + return """ + You are an OpenLineage lineage generation expert. + + Your job is to take the outputs from upstream PySpark data analysis agents and generate a **single, complete OpenLineage event JSON** representing end-to-end data lineage for the transformation pipeline. + + --- + + ### You will receive: + + 1. **Parsed Code Blocks** representing key transformation steps: + { + "sp1": { "name": "load_data", "code": "" }, + "sp2": { "name": "filter_data", "code": "" }, + "sp3": { "name": "compute_result", "code": "" } + } + + 2. **Field Mappings**: one per code block (same order), in this format: + [ + { + "output_fields": [ + { + "name": "", + "source": "", + "transformation": "" + } + ] + }, + ... + ] + + 3. **Logical Operators**: one per code block (same order), in this format: + [ + { + "output_fields": [ + { + "source_dataframe": "", + "source_fields": ["field1", "field2"], + "logical_operators": { + "filters": ["..."], + "joins": ["..."], + "group_by": ["..."], + "having": ["..."], + "order_by": ["..."], + "other": ["..."] + } + } + ] + }, + ... 
+ ]
+
+ ---
+
+ ### Your Task:
+
+ Generate **one event JSON** that captures the **entire pipeline** from raw source data to final derived outputs.
+
+ Strictly follow the structure below and do not change field names or nesting. It is **very important** to keep the exact same format:
+
+ - Use `"inputs"` and `"outputs"` as array keys (do NOT use `inputDataset` or `outputDataset`)
+ - Preserve `"facets"` blocks under `"job"`, `"inputs"`, and `"outputs"`
+ - Include `"columnLineage"` as a facet under `"outputs.facets"` (not at the top level)
+ - Maintain the exact field names shown in the Required Output Format below
+ - Match the structure and nesting exactly as in this format
+ - Based on the following examples, generate the input/output namespace and name values:
+ Examples:
+ Spark (Unity Catalog: catalog.schema.table)
+ SELECT id FROM main.analytics.events;
+ Expected:
+ namespace (input or output): main
+ name (input or output): analytics.events
+
+ Spark (Hive Metastore / no catalog: database.table)
+ SELECT * FROM sales.orders;
+ Expected:
+ namespace (input or output): default
+ name (input or output): sales.orders
+
+ Spark temporary views (temp.view or global_temp.view)
+ SELECT * FROM temp.session_orders;
+ Expected:
+ namespace (input or output): temp
+ name (input or output): session_orders
+
+ Spark path-based tables (Delta/Parquet/CSV via table-valued functions)
+ SELECT * FROM delta.`/mnt/data/events`;
+ Expected:
+ namespace (input or output): default
+ name (input or output): delta./mnt/data/events
+
+ - Wherever you cannot find a value for a required field (for example a namespace, name, field, type, description, transformation type, or subtype), just write "NA".
+
+ - Very important: your output must follow **exactly** this JSON structure; do not output explanations, comments, or anything else.
+ ---
+
+ ### Required Output Format (Example):
+ {
+ "inputs": [
+ {
+ "namespace": "",
+ "name": "",
+ "facets": {
+ "schema": {
+ "fields": [
+ {
+ "name": "",
+ "type": "",
+ "description": ""
+ }
+ ]
+ }
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "namespace": "",
+ "name": "",
+ "facets": {
+ "columnLineage": {
+ "fields": {
+ "": {
+ "inputFields": [
+ {
+ "namespace": "",
+ "name": "",
+ "field": "",
+ "transformations": [
+ {
+ "type": "",
+ "subtype": "",
+ "description": "",
+ "masking": false
+ }
+ ]
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ ]
+ }
+ """
diff --git a/lf_algorithm/plugins/spark_lineage_agent/spark_instructions.py b/lf_algorithm/plugins/spark_lineage_agent/spark_instructions.py
new file mode 100644
index 0000000000000000000000000000000000000000..9be6abb8d5e408d736ef487f82ffb25b48f724b2
--- /dev/null
+++ b/lf_algorithm/plugins/spark_lineage_agent/spark_instructions.py
@@ -0,0 +1,97 @@
+def syntax_analysis_instructions(name: str):
+ return f"""
+ You are the {name} Spark lineage analysis agent.
+
+ **Your Task:** Analyze the provided Spark script for syntax structure.
+
+ **Process:**
+ 1. Call the spark_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Spark script
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+def field_derivation_instructions(name: str):
+ return f"""
+ You are the {name} Spark lineage analysis agent.
+
+ **Your Task:** Analyze field mappings and transformations in the Spark script.
+
+ **Process:**
+ 1. Call the spark_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze field mappings
+ 3.
Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def operation_tracing_instructions(name: str): + return f""" + You are the {name} Spark lineage analysis agent. + + **Your Task:** Analyze logical operations and operators in the Spark script. + + **Process:** + 1. Call the spark_lineage_operation_tracing() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze logical operations + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def event_composer_instructions(name: str): + return f""" + You are the {name} Spark lineage analysis agent. + + **Your Task:** Compose OpenLineage events from the provided analysis data. + + **Process:** + 1. Call the spark_lineage_event_composer() MCP tool to get expert instructions + 2. Follow those instructions exactly to compose the OpenLineage event + 3. Return the event in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def comprehensive_analysis_instructions(name: str): + return f""" + You are the {name} Spark lineage analysis agent. + + **Your Task:** Perform complete Spark lineage analysis in a single comprehensive process. + + **Complete Analysis Process:** + + **Step 1: Syntax Analysis** + 1. Call the spark_lineage_syntax_analysis() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze the Spark script structure + 3. Store the syntax analysis results for use in subsequent steps + + **Step 2: Field Derivation** + 1. Call the spark_lineage_field_derivation() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis + 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations + 4. Store the field derivation results + + **Step 3: Operation Tracing** + 1. Call the spark_lineage_operation_tracing() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your operation analysis + 3. Follow the MCP tool instructions exactly to analyze logical operations and operators + 4. Store the operation tracing results + + **Step 4: Event Composition** + 1. Call the spark_lineage_event_composer() MCP tool to get expert instructions + 2. Combine all previous analysis results (syntax, field derivation, operation tracing) + 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event + 4. 
Return the complete OpenLineage event + + **Important Guidelines:** + - Each MCP tool contains detailed instructions, examples, and output format requirements + - Follow the MCP tool instructions precisely for each step + - Maintain context between steps - use results from earlier steps to inform later analysis + - Ensure the final output is a complete, properly formatted OpenLineage event + - If any step fails, provide clear error information and stop the process + + **Workflow Summary:** + Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output + """ \ No newline at end of file diff --git a/lf_algorithm/plugins/sql_lineage_agent/__init__.py b/lf_algorithm/plugins/sql_lineage_agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/lf_algorithm/plugins/sql_lineage_agent/__init__.py @@ -0,0 +1 @@ + diff --git a/lf_algorithm/plugins/sql_lineage_agent/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/sql_lineage_agent/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df243ab9f416c15ea5a21eee90e218737a2b4ed2 Binary files /dev/null and b/lf_algorithm/plugins/sql_lineage_agent/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/sql_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc b/lf_algorithm/plugins/sql_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..506f99414050a5fd728f7a91262ab046edeed141 Binary files /dev/null and b/lf_algorithm/plugins/sql_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/sql_lineage_agent/__pycache__/sql_instructions.cpython-313.pyc b/lf_algorithm/plugins/sql_lineage_agent/__pycache__/sql_instructions.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebad952c6734f8c136faea1ad81ee97fc2d1a667 Binary files /dev/null and b/lf_algorithm/plugins/sql_lineage_agent/__pycache__/sql_instructions.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/sql_lineage_agent/lineage_agent.py b/lf_algorithm/plugins/sql_lineage_agent/lineage_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..02f8b02941c5b6ba8133069d7dd5c6364e842d50 --- /dev/null +++ b/lf_algorithm/plugins/sql_lineage_agent/lineage_agent.py @@ -0,0 +1,98 @@ +import os +import sys +import logging +from contextlib import AsyncExitStack +from agents import Agent, Tool, Runner, trace +from agents.mcp.server import MCPServerStdio +from typing import Dict, Any, Optional + +from ...utils.tracers import log_trace_id +from ...plugins.sql_lineage_agent.sql_instructions import comprehensive_analysis_instructions +from ...plugins.sql_lineage_agent.mcp_servers.mcp_params import sql_mcp_server_params +from ...utils.file_utils import dump_json_record + +# Get logger for this module +logger = logging.getLogger(__name__) + +MAX_TURNS = 30 # Increased for comprehensive analysis + + +class SqlLineageAgent: + """Plugin agent for SQL lineage analysis""" + + def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None): + self.agent_name = agent_name + self.model_name = model_name + self.source_code = source_code + self.get_model_func = get_model_func + + async def create_agent(self, sql_mcp_servers) -> Agent: + # Use the passed get_model_func or fall back to the centralized one + if 
self.get_model_func: + model = self.get_model_func(self.model_name) + else: + from ...utils import get_model + model = get_model(self.model_name) + + agent = Agent( + name=self.agent_name, + instructions=comprehensive_analysis_instructions(self.agent_name), + model=model, + mcp_servers=sql_mcp_servers, + ) + return agent + + async def run_agent(self, sql_mcp_servers, source_code: str): + # Create single agent for comprehensive analysis + comprehensive_agent = await self.create_agent(sql_mcp_servers) + + # Run the complete analysis in one go + result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS) + + # Return the final output + return dump_json_record(self.agent_name, result.final_output) + + async def run_with_mcp_servers(self, source_code: str): + async with AsyncExitStack() as stack: + sql_mcp_servers = [ + await stack.enter_async_context( + MCPServerStdio(params, client_session_timeout_seconds=120) + ) + for params in sql_mcp_server_params + ] + return await self.run_agent(sql_mcp_servers, source_code=source_code) + + async def run_with_trace(self, source_code: str): + trace_name = f"{self.agent_name}-lineage-agent" + trace_id = log_trace_id(f"{self.agent_name.lower()}") + with trace(trace_name, trace_id=trace_id): + return await self.run_with_mcp_servers(source_code=source_code) + + async def run(self): + try: + logger.info(f"Starting SQL lineage analysis for {self.agent_name}") + result = await self.run_with_trace(self.source_code) + logger.info(f"Completed SQL lineage analysis for {self.agent_name}") + return result + except Exception as e: + logger.error(f"Error running {self.agent_name}: {e}") + return {"error": str(e)} + + +# Plugin interface functions +def create_sql_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> SqlLineageAgent: + """Factory function to create a SqlLineageAgent instance""" + return SqlLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func) + + +def get_plugin_info() -> Dict[str, Any]: + """Return plugin metadata""" + return { + "name": "sql-lineage-agent", + "description": "SQL lineage analysis agent for parsing and analyzing SQL queries", + "version": "1.0.0", + "author": "Ali Shamsaddinlou", + "agent_class": SqlLineageAgent, + "factory_function": create_sql_lineage_agent, + "supported_operations": ["lineage_analysis"], + } \ No newline at end of file diff --git a/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/__init__.py b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98c672da4ce85f78fc9ed7c7921cbf3d6d653b05 Binary files /dev/null and b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..350cdba2907e6c1e427f751674d623fa326786f7 Binary files /dev/null and 
b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_params.py b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_params.py new file mode 100644 index 0000000000000000000000000000000000000000..635d9ca19a0e7808283161f832147c9a67a91d46 --- /dev/null +++ b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_params.py @@ -0,0 +1,9 @@ +import os +from dotenv import load_dotenv + +load_dotenv(override=True) + + +sql_mcp_server_params = [ + {"command": "python", "args": ["lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/lineage_sql_server.py"]}, +] diff --git a/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/__init__.py b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/__pycache__/templates.cpython-313.pyc b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/__pycache__/templates.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2aa40acf315b976df1d91aad7c901a1a2ab09c0c Binary files /dev/null and b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/__pycache__/templates.cpython-313.pyc differ diff --git a/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/lineage_sql_server.py b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/lineage_sql_server.py new file mode 100644 index 0000000000000000000000000000000000000000..36d49520866394c6dcae9520da21d060bef7dd19 --- /dev/null +++ b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/lineage_sql_server.py @@ -0,0 +1,65 @@ +import logging + +# Configure logging to suppress verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger('mcp').setLevel(logging.WARNING) +logging.getLogger('mcp.server').setLevel(logging.WARNING) + +from mcp.server.fastmcp import FastMCP +from typing import Dict, Any + +mcp = FastMCP("lineage_sql_server") + +from templates import (sql_lineage_syntax_analysis as syntax_analysis_template, + sql_lineage_field_derivation as field_derivation_template, + sql_lineage_operation_tracing as operation_tracing_template, + sql_lineage_event_composer as event_composer_template, + sql_graph_builder as graph_builder_template) + +@mcp.tool() +async def sql_lineage_syntax_analysis() -> Dict[str, Any]: + """SQL lineage structure and syntax decomposition expert""" + return { + "instructions": syntax_analysis_template(), + "version": "1.0.0", + "capabilities": ["sql_parsing", "cte_extraction", "subquery_analysis"] + } + +@mcp.tool() +async def sql_lineage_field_derivation() -> Dict[str, Any]: + """Field mapping and field derivation expert""" + return { + "instructions": field_derivation_template(), + "version": "1.0.0", + "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"] + } + +@mcp.tool() +async def sql_lineage_operation_tracing() -> Dict[str, Any]: + """Logical operator analysis and operation tracing expert""" + return { + "instructions": operation_tracing_template(), + "version": "1.0.0", + "capabilities": ["filter_analysis", "join_analysis", "aggregation_tracking"] + } + +@mcp.tool() +async def sql_lineage_event_composer() -> Dict[str, Any]: + """Event composition and aggregation expert""" + return { + 
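+ # Prompt template plus version/capability metadata, mirroring the shape returned by the other tools above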
"instructions": event_composer_template(), + "version": "1.0.0", + "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"] + } + +@mcp.tool() +async def sql_lineage_graph_builder() -> Dict[str, Any]: + """Knowledge graph extraction and graph building expert""" + return { + "instructions": graph_builder_template(), + "version": "1.0.0", + "capabilities": ["graph_extraction", "node_edge_generation", "relationship_mapping"] + } + +if __name__ == "__main__": + mcp.run(transport='stdio') diff --git a/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/templates.py b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/templates.py new file mode 100644 index 0000000000000000000000000000000000000000..77d773d69bf90587b51ab0c54a56e83a51ef24e2 --- /dev/null +++ b/lf_algorithm/plugins/sql_lineage_agent/mcp_servers/mcp_sql_lineage/templates.py @@ -0,0 +1,578 @@ +from datetime import datetime + + +def sql_lineage_syntax_analysis(): + return """ + You are a SQL decomposition expert. Your task is to parse complex SQL scripts into logical subqueries, including CTEs, nested subqueries, and the final query. Return a clean JSON object of these blocks for downstream lineage processing. + Instructions: + - Extract: full CTEs, subqueries inside SELECT/FROM/WHERE, and the final main query. + - Do NOT extract individual SQL clauses (e.g., SELECT, WHERE) unless they represent a full subquery. + - Each extracted component should be a valid SQL unit that could be analyzed independently. + + Output format (JSON): + { + "sp1": { "name": "", "sql": "" }, + "sp2": { "name": "", "sql": "" }, + ... + } + + --- + + Positive Example 1: + + Input SQL: + WITH temp1 AS ( + SELECT id, value FROM table1 + ), + temp2 AS ( + SELECT id, SUM(value) as total FROM temp1 GROUP BY id + ) + SELECT * FROM temp2 WHERE total > 100; + + Expected Output: + { + "sp1": { + "name": "temp1", + "sql": "SELECT id, value FROM table1" + }, + "sp2": { + "name": "temp2", + "sql": "SELECT id, SUM(value) as total FROM temp1 GROUP BY id" + }, + "sp3": { + "name": "main_query", + "sql": "SELECT * FROM temp2 WHERE total > 100" + } + } + + --- + + Positive Example 2: + + Input SQL: + SELECT name FROM employees WHERE EXISTS ( + SELECT 1 FROM timesheets WHERE employees.id = timesheets.emp_id AND hours > 40 + ); + + Expected Output: + { + "sp1": { + "name": "subquery_exists", + "sql": "SELECT 1 FROM timesheets WHERE employees.id = timesheets.emp_id AND hours > 40" + }, + "sp2": { + "name": "main_query", + "sql": "SELECT name FROM employees WHERE EXISTS (SELECT 1 FROM timesheets WHERE employees.id = timesheets.emp_id AND hours > 40)" + } + } + + --- + + Negative Example 1 (Wrong: fragments instead of valid subqueries): + + { + "sp1": { "name": "select_clause", "sql": "SELECT id, value" }, + "sp2": { "name": "from_clause", "sql": "FROM table1" }, + "sp3": { "name": "where_clause", "sql": "WHERE value > 100" } + } + + Reason: These are not executable subqueries. They're just clauses. + + --- + + Negative Example 2 (Wrong: breaking apart a CTE): + + Input: + WITH temp AS ( + SELECT id, value FROM table1 WHERE value > 100 + ) + SELECT * FROM temp; + + Incorrect Output: + { + "sp1": { "name": "select_cte", "sql": "SELECT id, value" }, + "sp2": { "name": "where_cte", "sql": "WHERE value > 100" }, + "sp3": { "name": "main_query", "sql": "SELECT * FROM temp" } + } + + Reason: The CTE should be kept as a single logical block, not split by clause. 
+ + """ + + +def sql_lineage_field_derivation(): + return """ + You are a field mapping analysis expert. Given a SQL subquery, your job is to extract and explain how each output field is derived from the source tables. For each output field, identify: + + 1. The **source column(s)** it depends on (directly or via intermediate expressions or aggregates) + 2. The **transformation logic** applied (e.g., direct copy, SUM, CONCAT, CASE, etc.) + + Output Format: + { + "output_fields": [ + { + "namespace": "", + "name": "", + "field": "", + "transformation": "" + }, + ... + ] + } + + --- + + Positive Example 1 + + Input SQL: + SELECT customer_id, SUM(amount) AS total_spent FROM orders GROUP BY customer_id; + + Expected Output: + { + "output_fields": [ + { + "namespace": "default", + "name": "orders", + "field": "customer_id", + "transformation": "Group key, direct" + }, + { + "namespace": "default", + "name": "orders", + "field": "amount", + "transformation": "SUM(amount)" + } + ] + } + + --- + + Negative Example 1 (Incorrect structure): + + { + "customer_id": "orders.customer_id", + "total_spent": "SUM(amount)" + } + + --- + + Negative Example 2 (Missed transformation logic): + + Input SQL: + SELECT salary * 12 AS annual_salary FROM payroll; + + Incorrect Output: + { + "output_fields": [ + { + "namespace": "default", + "name": "payroll", + "field": "salary", + "transformation": "Direct" + } + ] + } + + Reason: This ignores the expression `salary * 12`. The transformation must be `"salary multiplied by 12"` or similar. + """ + +def sql_lineage_operation_tracing(): + return """ + You are a logical operator analysis expert. Your task is to analyze a SQL subquery and extract all **logical operations** on each source table and on which fields these logical operations are applied, including: + - Only list the fields that are used in the logical operations, not all fields. + - WHERE filters + - JOIN conditions + - GROUP BY and HAVING conditions + - ORDER BY clauses + - Any logical expressions affecting rows (e.g., EXISTS, IN, CASE) + + Return the result in the following structured format: + + { + "output_fields": [ + { + "source_table": "", + "source_fields": ["", "", "..."], + "logical_operators": { + "filters": [], + "joins": [], + "group_by": [], + "having": [], + "order_by": [], + "other": [] + } + } + ] + } + + - Only include entries for logical operators if the list is non-empty. + - Represent expressions clearly and fully. + - Normalize join conditions and predicates (e.g., `a.id = b.id`, `salary > 1000`). + - Include all source tables involved and only the fields used in logical operations. 
+ + --- + + Positive Example 1 + + Input SQL: + SELECT customer_id, SUM(amount) FROM orders WHERE region = 'US' GROUP BY customer_id HAVING SUM(amount) > 1000; + + Expected Output: + { + "output_fields": [ + { + "source_table": "orders", + "source_fields": ["region", "customer_id", "amount"], + "logical_operators": { + "filters": ["region = 'US'"], + "group_by": ["customer_id"], + "having": ["SUM(amount) > 1000"] + } + } + ] + } + + --- + + Positive Example 2 + + Input SQL: + SELECT e.name, d.dept_name FROM employees e JOIN departments d ON e.dept_id = d.id WHERE e.status = 'active' ORDER BY e.name; + + Expected Output: + { + "output_fields": [ + { + "source_table": "employees", + "source_fields": ["status", "dept_id", "name"], + "logical_operators": { + "filters": ["e.status = 'active'"], + "joins": ["e.dept_id = d.id"], + "order_by": ["e.name"] + } + }, + { + "source_table": "departments", + "source_fields": ["id"], + "logical_operators": { + "joins": ["e.dept_id = d.id"] + } + } + ] + } + + --- + + Positive Example 3 + + Input SQL: + SELECT * FROM accounts WHERE EXISTS (SELECT 1 FROM transactions WHERE accounts.id = transactions.account_id); + + Expected Output: + { + "output_fields": [ + { + "source_table": "accounts", + "source_fields": ["id"], + "logical_operators": { + "filters": ["EXISTS (SELECT 1 FROM transactions WHERE accounts.id = transactions.account_id)"] + } + } + ] + } + + --- + + Negative Example 1 (Incorrect formatting): + + { + "filters": "region = 'US'", + "group_by": "customer_id" + } + + Reason: Each value should be in a list and must be nested under `"output_fields"` with `"source_table"` and `"source_fields"` keys. + + --- + + Negative Example 2 (Missing logical clauses): + + Input SQL: + SELECT name FROM users WHERE age > 18 ORDER BY signup_date; + + Incorrect Output: + { + "output_fields": [ + { + "source_table": "users", + "source_fields": ["name", "age", "signup_date"], + "logical_operators": { + "filters": ["age > 18"] + } + } + ] + } + + Reason: The `order_by` clause is missing. + + """ + + + + +def sql_lineage_event_composer(): + return """ + You are an OpenLineage lineage generation expert. + Your job is to take the outputs from upstream SQL analysis agents and generate a **single, + complete OpenLineage event JSON** representing end-to-end data lineage for the query. + + --- + + ### You will receive: + + 1. **Parsed SQL Blocks** (CTEs and final query) in the format: + { + "sp1": { "name": "temp1", "sql": "" }, + "sp2": { "name": "temp2", "sql": "" }, + "sp3": { "name": "main_query", "sql": "" } + } + + 2. **Field Mappings**: one per SQL block (same order), in this format: + [ + { + "output_fields": [ + { + "name": "", + "source": "", + "transformation": "" + } + ] + }, + ... + ] + + 3. **Logical Operators**: one per SQL block (same order), in this format: + [ + { + "output_fields": [ + { + "source_table": "", + "source_fields": ["field1", "field2"], + "logical_operators": { + "filters": ["..."], + "joins": ["..."], + "group_by": ["..."], + "having": ["..."], + "order_by": ["..."], + "other": ["..."] + } + } + ] + }, + ... + ] + + --- + + ### Your Task: + + Generate **one event JSON** that captures the **entire query pipeline** from source tables to final output. + Strictly follow the structure below and do not change field names or nesting, it is very important to keep exact same format: + + - Include "columnLineage" as a facet under "outputs.facets" (not at the top level). 
+ - Maintain the exact field names: "inputs", "outputs", "facets", "fields", "storage", "datasetType", "lifecycleStateChange", "ownership", etc.
+ - Match the structure and nesting exactly as in this format:
+ - Based on the following examples, generate the input/output namespace and name values:
+
+ BigQuery
+ SELECT name, age
+ FROM project123.dataset456.customers;
+
+ Expected:
+ namespace (input or output): project123
+ name (input or output): dataset456.customers
+
+ Postgres
+ SELECT id, total
+ FROM sales_schema.orders;
+
+ Expected:
+ namespace (input or output): default
+ name (input or output): sales_schema.orders
+
+ MySQL
+ SELECT u.username, u.email
+ FROM ecommerce_db.users AS u;
+
+ Expected:
+ namespace (input or output): default
+ name (input or output): ecommerce_db.users
+
+ - Wherever you cannot find a value for a required field (for example a namespace, name, field, type, description, transformation type, or subtype), just write "NA".
+
+ - Very important: your output must follow **exactly** this JSON structure; do not output explanations, comments, or anything else.
+ ---
+
+ ### Required Output Format (Example):
+
+ {
+ "inputs": [
+ {
+ "namespace": "",
+ "name": "",
+ "facets": {
+ "schema": {
+ "fields": [
+ {
+ "name": "",
+ "type": "",
+ "description": ""
+ }
+ ]
+ }
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "namespace": "",
+ "name": "",
+ "facets": {
+ "columnLineage": {
+ "fields": {
+ "": {
+ "inputFields": [
+ {
+ "namespace": "",
+ "name": "",
+ "field": "",
+ "transformations": [
+ {
+ "type": "",
+ "subtype": "",
+ "description": "",
+ "masking": false
+ }
+ ]
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ ]
+ }
+ """
+
+
+def sql_graph_builder():
+ return """
+ You are a knowledge graph extraction agent. Your task is to transform the output of SQL lineage analysis into a cohesive **knowledge graph** in JSON format, with clearly defined nodes and edges.
+
+ Each lineage record includes:
+ - A subquery name and SQL
+ - Source tables
+ - Output fields with their source fields and transformations
+ - Logical operators applied per source table
+
+ ---
+
+ Your Goal:
+ Extract all meaningful **nodes** and **edges** that represent relationships between:
+ - Subqueries and source tables
+ - Fields and their source fields
+ - Transformation logic
+ - Logical operations (filters, joins, groupings, etc.)
+
+ ---
+
+ Output Schema (JSON):
+
+ {
+ "nodes": [
+ { "id": "", "type": "", "label": "" }
+ ],
+ "edges": [
+ { "source": "", "target": "", "type": "" }
+ ]
+ }
+
+ Node Types:
+ - "subquery"
+ - "table"
+ - "field"
+ - "operation"
+
+ Edge Types:
+ - "uses_table" (subquery → table)
+ - "produces_field" (subquery → output field)
+ - "derived_from" (output field → source field)
+ - "transformation" (field → transformation logic)
+ - "applies_operator" (operation → table or field)
+ - "joins_with" (table → table)
+ - "filters_by", "grouped_by", "ordered_by", etc.
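As a practical note, the "edges must refer to existing node ids" rule stated in the Important Rules below is easy to enforce mechanically before accepting a generated graph. A minimal sketch of such a check follows (a hypothetical helper, not part of this patch):

```python
# Sketch: report edge endpoints that are not declared as nodes in the graph JSON.
def find_dangling_edges(graph: dict) -> list[str]:
    node_ids = {node["id"] for node in graph.get("nodes", [])}
    problems = []
    for edge in graph.get("edges", []):
        for endpoint in (edge["source"], edge["target"]):
            if endpoint not in node_ids:
                problems.append(f"edge endpoint not declared as a node: {endpoint}")
    return problems
```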
+ + --- + + Example Input Lineage: + + { + "name": "sales_summary", + "sql": "SELECT region, SUM(amount) as total_sales FROM orders WHERE order_date >= '2023-01-01' GROUP BY region", + "source_tables": ["orders"], + "output_fields": [ + { + "name": "region", + "source": "orders.region", + "transformation": "Direct" + }, + { + "name": "total_sales", + "source": "orders.amount", + "transformation": "SUM(amount)" + } + ], + "logical_operators": { + "filters": ["order_date >= '2023-01-01'"], + "group_by": ["region"] + } + } + + --- + + Expected Graph Output: + + { + "nodes": [ + { "id": "subq_sales_summary", "type": "subquery", "label": "sales_summary" }, + { "id": "tbl_orders", "type": "table", "label": "orders" }, + { "id": "fld_region", "type": "field", "label": "region" }, + { "id": "fld_amount", "type": "field", "label": "amount" }, + { "id": "fld_total_sales", "type": "field", "label": "total_sales" }, + { "id": "op_filter", "type": "operation", "label": "filter: order_date >= '2023-01-01'" }, + { "id": "op_groupby", "type": "operation", "label": "group by region" } + ], + "edges": [ + { "source": "subq_sales_summary", "target": "tbl_orders", "type": "uses_table" }, + { "source": "subq_sales_summary", "target": "fld_total_sales", "type": "produces_field" }, + { "source": "fld_total_sales", "target": "fld_amount", "type": "derived_from" }, + { "source": "fld_total_sales", "target": "SUM(amount)", "type": "transformation" }, + { "source": "subq_sales_summary", "target": "fld_region", "type": "produces_field" }, + { "source": "fld_region", "target": "fld_region", "type": "derived_from" }, + { "source": "op_filter", "target": "tbl_orders", "type": "filters_by" }, + { "source": "op_groupby", "target": "fld_region", "type": "grouped_by" } + ] + } + + --- + + Important Rules: + - Every node must have a unique `id` + - Edges must refer to existing `node_id`s + - Normalize identifiers (e.g., table → `tbl_`, field → `fld_`, subquery → `subq_`) + + Now, based on the lineage input, extract a structured graph with all related nodes and edges. + """ diff --git a/lf_algorithm/plugins/sql_lineage_agent/sql_instructions.py b/lf_algorithm/plugins/sql_lineage_agent/sql_instructions.py new file mode 100644 index 0000000000000000000000000000000000000000..74d5cab6a1dc842b372f4e5e2617797aac660aa7 --- /dev/null +++ b/lf_algorithm/plugins/sql_lineage_agent/sql_instructions.py @@ -0,0 +1,113 @@ +def comprehensive_analysis_instructions(name: str): + return f""" + You are the {name} SQL lineage analysis agent. + + **Your Task:** Perform complete SQL query lineage analysis in a single comprehensive process. + + **Complete Analysis Process:** + + **Step 1: Syntax Analysis** + 1. Call the sql_lineage_syntax_analysis() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze the SQL query structure + 3. Store the syntax analysis results for use in subsequent steps + + **Step 2: Field Derivation** + 1. Call the sql_lineage_field_derivation() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis + 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations + 4. Store the field derivation results + + **Step 3: Operation Tracing** + 1. Call the sql_lineage_operation_tracing() MCP tool to get expert instructions + 2. Use the syntax analysis results from Step 1 to inform your operation analysis + 3. Follow the MCP tool instructions exactly to analyze logical operations and operators + 4. 
Store the operation tracing results + + **Step 4: Event Composition** + 1. Call the sql_lineage_event_composer() MCP tool to get expert instructions + 2. Combine all previous analysis results (syntax, field derivation, operation tracing) + 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event + 4. Return the complete OpenLineage event + + **Important Guidelines:** + - Each MCP tool contains detailed instructions, examples, and output format requirements + - Follow the MCP tool instructions precisely for each step + - Maintain context between steps - use results from earlier steps to inform later analysis + - Ensure the final output is a complete, properly formatted OpenLineage event + - If any step fails, provide clear error information and stop the process + + **Workflow Summary:** + Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output + """ + +# Keep the individual instructions for backward compatibility if needed +def syntax_analysis_instructions(name: str): + return f""" + You are the {name} SQL lineage analysis agent. + + **Your Task:** Analyze the provided SQL query for syntax structure. + + **Process:** + 1. Call the sql_lineage_syntax_analysis() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze the SQL query + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def field_derivation_instructions(name: str): + return f""" + You are the {name} SQL lineage analysis agent. + + **Your Task:** Analyze field mappings and transformations in the SQL query. + + **Process:** + 1. Call the sql_lineage_field_derivation() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze field mappings + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def operation_tracing_instructions(name: str): + return f""" + You are the {name} SQL lineage analysis agent. + + **Your Task:** Analyze logical operations and operators in the SQL query. + + **Process:** + 1. Call the sql_lineage_operation_tracing() MCP tool to get expert instructions + 2. Follow those instructions exactly to analyze logical operations + 3. Return the analysis results in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def event_composer_instructions(name: str): + return f""" + You are the {name} SQL lineage analysis agent. + + **Your Task:** Compose OpenLineage events from the provided analysis data. + + **Process:** + 1. Call the sql_lineage_event_composer() MCP tool to get expert instructions + 2. Follow those instructions exactly to compose the OpenLineage event + 3. Return the event in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + +def graph_builder_instructions(name: str): + return f""" + You are the {name} SQL lineage analysis agent. + + **Your Task:** Build knowledge graph from the SQL query analysis. + + **Process:** + 1. Call the sql_lineage_graph_builder() MCP tool to get expert instructions + 2. 
Follow those instructions exactly to build the knowledge graph + 3. Return the graph in the format specified by the MCP tool + + **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely. + """ + \ No newline at end of file diff --git a/lf_algorithm/tools/renderer/__init__.py b/lf_algorithm/tools/renderer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lf_algorithm/utils/__init__.py b/lf_algorithm/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03b5a6c806542aa697039d71a1ee072368848955 --- /dev/null +++ b/lf_algorithm/utils/__init__.py @@ -0,0 +1,12 @@ +from .logging_config import get_logger, setup_logging, configure_logging, write_lineage_log +from .model_manager import get_model, get_api_clients, validate_api_keys + +__all__ = [ + 'get_logger', + 'setup_logging', + 'configure_logging', + 'write_lineage_log', + 'get_model', + 'get_api_clients', + 'validate_api_keys' +] diff --git a/lf_algorithm/utils/__pycache__/__init__.cpython-313.pyc b/lf_algorithm/utils/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..404928a0a2b4a4bbbc9dcd9c8e621ce65b6317fc Binary files /dev/null and b/lf_algorithm/utils/__pycache__/__init__.cpython-313.pyc differ diff --git a/lf_algorithm/utils/__pycache__/file_utils.cpython-313.pyc b/lf_algorithm/utils/__pycache__/file_utils.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f06c2a5ac4b0f1cf12f2c0b1f21ca7dc47b3b58 Binary files /dev/null and b/lf_algorithm/utils/__pycache__/file_utils.cpython-313.pyc differ diff --git a/lf_algorithm/utils/__pycache__/logging_config.cpython-313.pyc b/lf_algorithm/utils/__pycache__/logging_config.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6c0aa3324964e679c7d06ac2002b8e301145fe3 Binary files /dev/null and b/lf_algorithm/utils/__pycache__/logging_config.cpython-313.pyc differ diff --git a/lf_algorithm/utils/__pycache__/model_manager.cpython-313.pyc b/lf_algorithm/utils/__pycache__/model_manager.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6560bc209bda8079119f446ad16cee5595c72a01 Binary files /dev/null and b/lf_algorithm/utils/__pycache__/model_manager.cpython-313.pyc differ diff --git a/lf_algorithm/utils/__pycache__/tracers.cpython-313.pyc b/lf_algorithm/utils/__pycache__/tracers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e6053398d830337f3bedf778481648763ace7b5 Binary files /dev/null and b/lf_algorithm/utils/__pycache__/tracers.cpython-313.pyc differ diff --git a/lf_algorithm/utils/file_utils.py b/lf_algorithm/utils/file_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..150b4326e38a04976c59a3c1c2f9837587303a4e --- /dev/null +++ b/lf_algorithm/utils/file_utils.py @@ -0,0 +1,162 @@ +import json +import os +import re +from pathlib import Path +from typing import Dict, Any, Optional, Union +from datetime import datetime + + +def clean_json_string(text: str) -> str: + """ + Clean a string that might contain markdown formatting and extract just the JSON content. 
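+
+ Example:
+ clean_json_string('```json\n{"a": 1}\n```') -> '{"a": 1}'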
+ """ + # Remove markdown code blocks + text = re.sub(r'```json\s*\n?', '', text) + text = re.sub(r'```\s*\n?', '', text) + + # Remove leading/trailing whitespace and newlines + text = text.strip() + + return text + + +def dump_json_record(filename: str, record: Union[Dict[str, Any], str], lineage_extraction_dumps_folder: str = "lineage_extraction_dumps") -> Union[Dict[str, Any], str]: + """ + Create a file under the lineagedb folder and dump a JSON record as a new line. + + Args: + filename (str): The name of the file (without extension, .json will be added) + record (Union[Dict[str, Any], str]): The JSON record to dump (can be dict or string) + lineage_extraction_dumps_folder (str): The folder name for lineage database files (default: "lineage_extraction_dumps") + + Returns: + Union[Dict[str, Any], str]: The processed record that was dumped to the file + + Example: + dumped_data = dump_json_record("user_queries", {"query": "SELECT * FROM users"}) + dumped_data = dump_json_record("outputs", "This is a string output") + """ + # Create the lineagedb folder if it doesn't exist + # folder_path = Path(lineage_extraction_dumps_folder) + # folder_path.mkdir(exist_ok=True) + + # Create the full file path with .json extension + # file_path = folder_path / f"{filename}.json" + + # Handle different input types + if isinstance(record, str): + # Clean the string first to remove any markdown formatting + cleaned_record = clean_json_string(record) + + # Try to parse as JSON first, then re-serialize properly + try: + # Parse the string as JSON to get the actual data + parsed_data = json.loads(cleaned_record) + # Re-serialize without escaping newlines and with proper formatting + json_line = json.dumps(parsed_data, ensure_ascii=False, separators=(',', ':')) + processed_record = parsed_data + except json.JSONDecodeError: + # If it's not valid JSON, treat it as a plain string + json_line = json.dumps(cleaned_record, ensure_ascii=False) + processed_record = cleaned_record + + elif isinstance(record, dict): + # If it's already a dict, convert to JSON string + json_line = json.dumps(record, ensure_ascii=False, separators=(',', ':')) + processed_record = record + else: + # For other types, convert to string and then to JSON + cleaned_record = clean_json_string(str(record)) + try: + parsed_data = json.loads(cleaned_record) + json_line = json.dumps(parsed_data, ensure_ascii=False, separators=(',', ':')) + processed_record = parsed_data + except json.JSONDecodeError: + json_line = json.dumps(cleaned_record, ensure_ascii=False) + processed_record = cleaned_record + + # Append the JSON record as a new line to the file + # with open(file_path, "a", encoding="utf-8") as f: + # f.write(json_line + "\n") + + return processed_record + + +def read_json_records(filename: str, lineagedb_folder: str = "lineagedb") -> list: + """ + Read all JSON records from a file in the lineagedb folder. + + Args: + filename (str): The name of the file (without extension) + lineagedb_folder (str): The folder name for lineage database files (default: "lineagedb") + + Returns: + list: List of dictionaries containing the JSON records + """ + folder_path = Path(lineagedb_folder) + file_path = folder_path / f"{filename}.json" + + records = [] + if file_path.exists(): + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: # Skip empty lines + try: + record = json.loads(line) + records.append(record) + except json.JSONDecodeError as e: + print(f"Warning: Could not parse JSON line: {line[:50]}... 
Error: {e}") + + return records + + +def clear_json_file(filename: str, lineagedb_folder: str = "lineagedb") -> None: + """ + Clear all records from a JSON file in the lineagedb folder. + + Args: + filename (str): The name of the file (without extension) + lineagedb_folder (str): The folder name for lineage database files (default: "lineagedb") + """ + folder_path = Path(lineagedb_folder) + file_path = folder_path / f"{filename}.json" + + if file_path.exists(): + file_path.unlink() # Delete the file + print(f"Cleared file: {file_path}") + + +def get_file_stats(filename: str, lineagedb_folder: str = "lineagedb") -> Dict[str, Any]: + """ + Get statistics about a JSON file in the lineagedb folder. + + Args: + filename (str): The name of the file (without extension) + lineagedb_folder (str): The folder name for lineage database files (default: "lineagedb") + + Returns: + Dict[str, Any]: Statistics about the file including record count, file size, etc. + """ + folder_path = Path(lineagedb_folder) + file_path = folder_path / f"{filename}.json" + + stats = { + "filename": f"{filename}.json", + "exists": file_path.exists(), + "record_count": 0, + "file_size_bytes": 0, + "created_time": None, + "modified_time": None + } + + if file_path.exists(): + stats["file_size_bytes"] = file_path.stat().st_size + stats["created_time"] = datetime.fromtimestamp(file_path.stat().st_ctime).isoformat() + stats["modified_time"] = datetime.fromtimestamp(file_path.stat().st_mtime).isoformat() + + # Count records + with open(file_path, "r", encoding="utf-8") as f: + stats["record_count"] = sum(1 for line in f if line.strip()) + + return stats \ No newline at end of file diff --git a/lf_algorithm/utils/logging_config.py b/lf_algorithm/utils/logging_config.py new file mode 100644 index 0000000000000000000000000000000000000000..75076743d2989eb179a43d45c841d5703664004d --- /dev/null +++ b/lf_algorithm/utils/logging_config.py @@ -0,0 +1,120 @@ +import logging +from typing import Optional +from datetime import datetime +from enum import Enum + +# Add NullHandler to prevent "No handler could be found" warnings +# This is the only logging configuration the library should do +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +# Color enum for console output (from database.py) +class Color(Enum): + WHITE = "\033[97m" + CYAN = "\033[96m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + MAGENTA = "\033[95m" + RED = "\033[91m" + RESET = "\033[0m" + +# Color mapping for different log types (from database.py) +color_mapper = { + "trace": Color.WHITE, + "agent": Color.CYAN, + "function": Color.GREEN, + "generation": Color.YELLOW, + "response": Color.MAGENTA, + "account": Color.RED, + "span": Color.CYAN, # Default for span type +} + +def get_logger(name: str) -> logging.Logger: + """ + Get a logger with the specified name. + + This is a simple wrapper around logging.getLogger() that ensures + the library follows proper logging patterns. + + Args: + name: The name for the logger (usually __name__) + + Returns: + logging.Logger: Logger instance + """ + return logging.getLogger(name) + +def write_lineage_log(name: str, type: str, message: str): + """ + Write a log entry using standard logging. + + This function uses standard logging instead of writing to files directly. + The application is responsible for configuring where logs go (console, files, etc.). 
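+
+ Records are emitted under the "lf_algorithm.lineage.<name>" logger hierarchy, so applications can route or filter them with standard logging configuration.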
+ + Args: + name (str): The name associated with the log + type (str): The type of log entry + message (str): The log message + """ + # Map log types to standard logging levels + type_to_level = { + "trace": logging.INFO, + "agent": logging.INFO, + "function": logging.DEBUG, + "generation": logging.INFO, + "response": logging.INFO, + "account": logging.WARNING, + "span": logging.DEBUG, + "error": logging.ERROR, + "warning": logging.WARNING, + "info": logging.INFO, + "debug": logging.DEBUG + } + + level = type_to_level.get(type.lower(), logging.INFO) + + # Use the lineage logger with structured data + lineage_logger = logging.getLogger(f"lf_algorithm.lineage.{name}") + + # Log with structured context + lineage_logger.log( + level, + f"{type}: {message}", + extra={ + "lineage_name": name, + "lineage_type": type, + "lineage_datetime": datetime.now().isoformat() + } + ) + +# Legacy functions for backward compatibility - these now just return loggers +# without any configuration, as the application should handle all configuration +def setup_logging( + level: int = None, + log_to_file: bool = None, + log_to_console: bool = None, + use_colors: bool = None +) -> None: + """ + Legacy function - does nothing in library mode. + + Applications should configure logging themselves using: + - logging.basicConfig() for simple setups + - logging.config.dictConfig() for advanced setups + """ + # This function is kept for backward compatibility but does nothing + # The library should not configure logging - that's the application's job + pass + +def configure_logging( + log_dir: str = None, + log_file: str = None, + enabled: bool = None +) -> None: + """ + Legacy function - does nothing in library mode. + + Applications should configure logging themselves. + """ + # This function is kept for backward compatibility but does nothing + # The library should not configure logging - that's the application's job + pass diff --git a/lf_algorithm/utils/model_manager.py b/lf_algorithm/utils/model_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..893107cf18c6146e6cea1c2e839d5c3719b6a5db --- /dev/null +++ b/lf_algorithm/utils/model_manager.py @@ -0,0 +1,87 @@ +import os +import logging +from openai import AsyncOpenAI +from agents import OpenAIChatCompletionsModel +from dotenv import load_dotenv + +# Get logger for this module +logger = logging.getLogger(__name__) + +# Load environment variables +load_dotenv(override=True) + +# API Keys +DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY") +GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") +GROK_API_KEY = os.getenv("GROK_API_KEY") +OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") + +# Base URLs +DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1" +GROK_BASE_URL = "https://api.x.ai/v1" +GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/" +OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" + +# Initialize API clients +openrouter_client = AsyncOpenAI(base_url=OPENROUTER_BASE_URL, api_key=OPENROUTER_API_KEY) +deepseek_client = AsyncOpenAI(base_url=DEEPSEEK_BASE_URL, api_key=DEEPSEEK_API_KEY) +grok_client = AsyncOpenAI(base_url=GROK_BASE_URL, api_key=GROK_API_KEY) +gemini_client = AsyncOpenAI(base_url=GEMINI_BASE_URL, api_key=GOOGLE_API_KEY) + + +def get_model(model_name: str): + """ + Get the appropriate model based on the model name. 
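+
+ Routing: model names containing "/" are sent through OpenRouter; names containing "deepseek", "grok", or "gemini" use their respective clients; any other name is returned unchanged for the agents framework to resolve.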
+ + Args: + model_name (str): The name of the model to use + + Returns: + OpenAIChatCompletionsModel or str: The model instance or model name + """ + if "/" in model_name: + return OpenAIChatCompletionsModel(model=model_name, openai_client=openrouter_client) + elif "deepseek" in model_name: + return OpenAIChatCompletionsModel(model=model_name, openai_client=deepseek_client) + elif "grok" in model_name: + return OpenAIChatCompletionsModel(model=model_name, openai_client=grok_client) + elif "gemini" in model_name: + return OpenAIChatCompletionsModel(model=model_name, openai_client=gemini_client) + else: + return model_name + + +def get_api_clients(): + """ + Get all API clients for external use if needed. + + Returns: + dict: Dictionary containing all API clients + """ + return { + 'openrouter': openrouter_client, + 'deepseek': deepseek_client, + 'grok': grok_client, + 'gemini': gemini_client + } + + +def validate_api_keys(): + """ + Validate that required API keys are available. + + Returns: + dict: Dictionary with validation results for each API + """ + validation_results = { + 'openrouter': bool(OPENROUTER_API_KEY), + 'deepseek': bool(DEEPSEEK_API_KEY), + 'grok': bool(GROK_API_KEY), + 'gemini': bool(GOOGLE_API_KEY) + } + + missing_keys = [key for key, available in validation_results.items() if not available] + if missing_keys: + logger.warning(f"Missing API keys for: {', '.join(missing_keys)}") + + return validation_results diff --git a/lf_algorithm/utils/tracers.py b/lf_algorithm/utils/tracers.py new file mode 100644 index 0000000000000000000000000000000000000000..051b8e6416931cc03df8f79c66dd330c00139525 --- /dev/null +++ b/lf_algorithm/utils/tracers.py @@ -0,0 +1,78 @@ +from agents import TracingProcessor, Trace, Span +import sys +import os + +from .logging_config import write_lineage_log +import secrets +import string + +ALPHANUM = string.ascii_lowercase + string.digits + +def log_trace_id(tag: str) -> str: + """ + Return a string of the form 'trace_', + where the total length after 'trace_' is 32 chars. 
+ """ + tag += "0" + pad_len = 32 - len(tag) + random_suffix = ''.join(secrets.choice(ALPHANUM) for _ in range(pad_len)) + return f"trace_{tag}{random_suffix}" + +class LogTracer(TracingProcessor): + + def get_name(self, trace_or_span: Trace | Span) -> str | None: + trace_id = trace_or_span.trace_id + name = trace_id.split("_")[1] + if '0' in name: + return name.split("0")[0] + else: + return None + + def on_trace_start(self, trace) -> None: + name = self.get_name(trace) + if name: + write_lineage_log(name, "trace", f"Started: {trace.name}") + + def on_trace_end(self, trace) -> None: + name = self.get_name(trace) + if name: + write_lineage_log(name, "trace", f"Ended: {trace.name}") + + def on_span_start(self, span) -> None: + name = self.get_name(span) + type = span.span_data.type if span.span_data else "span" + if name: + message = "Started" + if span.span_data: + if span.span_data.type: + message += f" {span.span_data.type}" + if hasattr(span.span_data, "name") and span.span_data.name: + message += f" {span.span_data.name}" + if hasattr(span.span_data, "server") and span.span_data.server: + message += f" {span.span_data.server}" + if span.error: + message += f" {span.error}" + write_lineage_log(name, type, message) + + def on_span_end(self, span) -> None: + name = self.get_name(span) + type = span.span_data.type if span.span_data else "span" + if name: + message = "Ended" + if span.span_data: + if span.span_data.type: + + message += f" {span.span_data.type}" + if hasattr(span.span_data, "name") and span.span_data.name: + message += f" {span.span_data.name}" + if hasattr(span.span_data, "server") and span.span_data.server: + message += f" {span.span_data.server}" + if span.error: + message += f" {span.error}" + write_lineage_log(name, type, message) + + def force_flush(self) -> None: + pass + + def shutdown(self) -> None: + pass \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..32c524886bf32a1a7cf508f5ce04f9e2d58ef0d7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,98 @@ +[project] +name = "lineagentic-flow" +version = "1.0.2" +description = "Lineagentic-flow is agentic ai approach for building data lineage across diverse data processing scripts including python, sql, java, airflow, spark, etc." 
+readme = "README.md" +requires-python = ">=3.13" +license = "MIT" +authors = [ + {name = "Lineagentic Flow Team", email = "team@lineagentic.com"} +] +keywords = ["data-lineage", "ai-agents", "data-processing", "lineage-tracking"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.13", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +dependencies = [ + "anthropic>=0.49.0", + "autogen-agentchat==0.6.1", + "autogen-ext[grpc,mcp,ollama,openai]==0.6.1", + "bs4>=0.0.2", + "fastapi>=0.115.0", + "gradio>=5.22.0", + "httpx>=0.28.1", + "ipykernel>=6.29.5", + "ipython>=8.0.0,<9.0.0", + "ipywidgets>=8.1.5", + "jupyter>=1.1.1", + "langchain-anthropic>=0.3.10", + "langchain-community>=0.3.20", + "langchain-experimental>=0.3.4", + "langchain-openai>=0.3.9", + "langgraph>=0.3.18", + "langgraph-checkpoint-sqlite>=2.0.6", + "langsmith>=0.3.18", + "lxml>=5.3.1", + "mcp-server-fetch>=2025.1.17", + "mcp[cli]>=1.5.0", + "mysql-connector-python>=8.0.0", + "nbformat>=4.2.0", + "neo4j>=5.20.0", + "openai==1.91.0", + "openai-agents==0.0.19", + "playwright>=1.51.0", + "plotly>=6.0.1", + "polygon-api-client>=1.14.5", + "psutil>=7.0.0", + "pypdf>=5.4.0", + "pypdf2>=3.0.1", + "python-dotenv>=1.0.1", + "requests>=2.32.3", + "semantic-kernel>=1.25.0", + "sendgrid>=6.11.0", + "setuptools>=78.1.0", + "smithery>=0.1.0", + "speedtest-cli>=2.1.3", + "watchdog>=6.0.0", + "wikipedia>=1.4.0", +] + +[project.optional-dependencies] +dev = [ + "ipykernel>=6.29.5", + "pytest>=8.0.0", + "pytest-asyncio>=0.24.0", +] + +[project.urls] +Homepage = "https://github.com/lineagentic/lineagentic-flow" +Documentation = "https://lineagentic-flow.readthedocs.io" +Repository = "https://github.com/lineagentic/lineagentic-flow" +Issues = "https://github.com/lineagentic/lineagentic-flow/issues" + +[project.entry-points."lineagentic.lf_algorithm.plugins"] +sql-lineage-agent = "lf_algorithm.plugins.sql_lineage_agent.lineage_agent:get_plugin_info" +python-lineage-agent = "lf_algorithm.plugins.python_lineage_agent.lineage_agent:get_plugin_info" +airflow-lineage-agent = "lf_algorithm.plugins.airflow_lineage_agent.lineage_agent:get_plugin_info" +java-lineage-agent = "lf_algorithm.plugins.java_lineage_agent.lineage_agent:get_plugin_info" +spark-lineage-agent = "lf_algorithm.plugins.spark_lineage_agent.lineage_agent:get_plugin_info" + +[project.scripts] +lineagentic = "cli.main:main" + +[build-system] +requires = ["setuptools>=78.1.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +include = ["lf_algorithm*", "cli*"] + +[tool.setuptools.package-data] +"lf_algorithm" = [ "*.md"] +"cli" = ["*.md"] + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e07c9481cb18ffa0ab31a4facdcda9132d6100e9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,42 @@ +# Deployment-specific requirements for Hugging Face Spaces +# Minimal requirements for demo deployment + +# Core dependencies +anthropic>=0.49.0 +autogen-agentchat==0.6.1 +autogen-ext[grpc,mcp,ollama,openai]==0.6.1 +bs4>=0.0.2 +fastapi>=0.115.0 +gradio>=5.22.0,<6.0.0 +httpx>=0.28.1 +langchain-anthropic>=0.3.10 +langchain-community>=0.3.20 +langchain-experimental>=0.3.4 +langchain-openai>=0.3.9 +langgraph>=0.3.18 +langgraph-checkpoint-sqlite>=2.0.6 +langsmith>=0.3.18 +lxml>=5.3.1 +mcp-server-fetch>=2025.1.17 
+mcp[cli]>=1.5.0 +mysql-connector-python>=8.0.0 +neo4j>=5.20.0 +openai==1.91.0 +openai-agents==0.0.19 +plotly>=6.0.1 +polygon-api-client>=1.14.5 +psutil>=7.0.0 +pypdf>=5.4.0 +pypdf2>=3.0.1 +python-dotenv>=1.0.1 +requests>=2.32.3 +semantic-kernel>=1.25.0 +sendgrid>=6.11.0 +setuptools>=78.1.0 +smithery>=0.1.0 +speedtest-cli>=2.1.3 +watchdog>=6.0.0 +wikipedia>=1.4.0 + +# Note: lineagentic-flow package will be installed locally after files are copied +# The package is not included here to avoid pip installation issues during build diff --git a/start_demo_server.py b/start_demo_server.py new file mode 100644 index 0000000000000000000000000000000000000000..e463034f79eb943b1510c02953891dd2f00f7977 --- /dev/null +++ b/start_demo_server.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +""" +Startup script for the SQL Lineage Analysis Demo Server +""" + +import os +import sys +import gradio as gr +from pathlib import Path + + + +def main(): + """Start the demo server with configuration""" + + # Run deployment setup if needed + if os.path.exists("deploy_setup.py"): + print("🔧 Running deployment setup...") + try: + import deploy_setup + success = deploy_setup.install_local_package() + if success: + print("✅ Package installation completed successfully") + else: + print("❌ Package installation failed") + except Exception as e: + print(f"⚠️ Deployment setup failed: {e}") + else: + print("⚠️ deploy_setup.py not found, skipping package installation") + + # Configuration + host = os.getenv("DEMO_HOST", "0.0.0.0") + port = int(os.getenv("DEMO_PORT", "7860")) + share = os.getenv("DEMO_SHARE", "false").lower() == "true" + inbrowser = os.getenv("DEMO_INBROWSER", "true").lower() == "true" + debug = os.getenv("DEMO_DEBUG", "false").lower() == "true" + + print(f"Starting SQL Lineage Analysis Demo Server...") + print(f"Host: {host}") + print(f"Port: {port}") + print(f"Share: {share}") + print(f"Open in browser: {inbrowser}") + print(f"Debug mode: {debug}") + print(f"Demo Interface: http://{host}:{port}") + print() + + try: + + # Import and run the demo server + from demo_server import SQLLineageFrontend + + frontend = SQLLineageFrontend() + ui = frontend.create_ui() + + # Launch the Gradio interface + ui.launch( + server_name=host, + server_port=port, + share=share, + inbrowser=inbrowser, + debug=debug, + show_error=True + ) + + except KeyboardInterrupt: + print("\nShutting down demo server...") + except Exception as e: + print(f"Error starting demo server: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() \ No newline at end of file
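Finally, a short usage sketch of the SQL lineage agent added by this patch, driven from application code. The query, agent name, and model are placeholders, and an OpenAI-compatible API key is assumed to be configured in the environment:

```python
# Illustrative sketch only: run the SQL lineage agent added by this patch.
# Run from the repository root so the relative MCP server path in mcp_params.py resolves.
import asyncio

from lf_algorithm.plugins.sql_lineage_agent.lineage_agent import create_sql_lineage_agent

SQL = """
WITH us_orders AS (
    SELECT customer_id, amount FROM orders WHERE region = 'US'
)
SELECT customer_id, SUM(amount) AS total FROM us_orders GROUP BY customer_id;
"""


async def main():
    agent = create_sql_lineage_agent(
        agent_name="demo-sql",
        source_code=SQL,
        model_name="gpt-4o-mini",  # default model name used throughout the package
    )
    event = await agent.run()  # OpenLineage event dict, or {"error": "..."} on failure
    print(event)


if __name__ == "__main__":
    asyncio.run(main())
```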