alishams21 committed
Commit e00e744 · verified · 1 Parent(s): 035331b

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. LICENSE +197 -0
  2. MANIFEST.in +39 -0
  3. README.md +203 -8
  4. cli/README.md +167 -0
  5. cli/__init__.py +5 -0
  6. cli/main.py +238 -0
  7. demo_server.py +321 -0
  8. deploy_setup.py +43 -0
  9. lf_algorithm/__init__.py +46 -0
  10. lf_algorithm/__pycache__/__init__.cpython-313.pyc +0 -0
  11. lf_algorithm/__pycache__/agent_manager.cpython-313.pyc +0 -0
  12. lf_algorithm/__pycache__/framework_agent.cpython-313.pyc +0 -0
  13. lf_algorithm/agent_manager.py +84 -0
  14. lf_algorithm/framework_agent.py +130 -0
  15. lf_algorithm/models/__pycache__/models.cpython-313.pyc +0 -0
  16. lf_algorithm/models/models.py +285 -0
  17. lf_algorithm/plugins/__init__.py +1 -0
  18. lf_algorithm/plugins/__pycache__/__init__.cpython-313.pyc +0 -0
  19. lf_algorithm/plugins/airflow_lineage_agent/__init__.py +1 -0
  20. lf_algorithm/plugins/airflow_lineage_agent/__pycache__/__init__.cpython-313.pyc +0 -0
  21. lf_algorithm/plugins/airflow_lineage_agent/__pycache__/airflow_instructions.cpython-313.pyc +0 -0
  22. lf_algorithm/plugins/airflow_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc +0 -0
  23. lf_algorithm/plugins/airflow_lineage_agent/airflow_instructions.py +98 -0
  24. lf_algorithm/plugins/airflow_lineage_agent/lineage_agent.py +98 -0
  25. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__init__.py +0 -0
  26. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc +0 -0
  27. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc +0 -0
  28. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/__init__.py +0 -0
  29. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py +55 -0
  30. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/templates.py +777 -0
  31. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_params.py +9 -0
  32. lf_algorithm/plugins/java_lineage_agent/__init__.py +1 -0
  33. lf_algorithm/plugins/java_lineage_agent/__pycache__/__init__.cpython-313.pyc +0 -0
  34. lf_algorithm/plugins/java_lineage_agent/__pycache__/java_instructions.cpython-313.pyc +0 -0
  35. lf_algorithm/plugins/java_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc +0 -0
  36. lf_algorithm/plugins/java_lineage_agent/java_instructions.py +98 -0
  37. lf_algorithm/plugins/java_lineage_agent/lineage_agent.py +97 -0
  38. lf_algorithm/plugins/java_lineage_agent/mcp_servers/__init__.py +0 -0
  39. lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc +0 -0
  40. lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc +0 -0
  41. lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/__init__.py +0 -0
  42. lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py +55 -0
  43. lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/templates.py +605 -0
  44. lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_params.py +9 -0
  45. lf_algorithm/plugins/python_lineage_agent/__init__.py +1 -0
  46. lf_algorithm/plugins/python_lineage_agent/__pycache__/__init__.cpython-313.pyc +0 -0
  47. lf_algorithm/plugins/python_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc +0 -0
  48. lf_algorithm/plugins/python_lineage_agent/__pycache__/python_instructions.cpython-313.pyc +0 -0
  49. lf_algorithm/plugins/python_lineage_agent/lineage_agent.py +97 -0
  50. lf_algorithm/plugins/python_lineage_agent/mcp_servers/__init__.py +0 -0
LICENSE ADDED
@@ -0,0 +1,197 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity granting the License.
13
+
14
+ "Legal Entity" shall mean the union of the acting entity and all
15
+ other entities that control, are controlled by, or are under common
16
+ control with that entity. For the purposes of this definition,
17
+ "control" means (i) the power, direct or indirect, to cause the
18
+ direction or management of such entity, whether by contract or
19
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
20
+ outstanding shares, or (iii) beneficial ownership of such entity.
21
+
22
+ "You" (or "Your") shall mean an individual or Legal Entity
23
+ exercising permissions granted by this License.
24
+
25
+ "Source" form shall mean the preferred form for making modifications,
26
+ including but not limited to software source code, documentation
27
+ source, and configuration files.
28
+
29
+ "Object" form shall mean any form resulting from mechanical
30
+ transformation or translation of a Source form, including but
31
+ not limited to compiled object code, generated documentation,
32
+ and conversions to other media types.
33
+
34
+ "Work" shall mean the work of authorship, whether in Source or
35
+ Object form, made available under the License, as indicated by a
36
+ copyright notice that is included in or attached to the work
37
+ (which shall not include communications that are clearly marked or
38
+ otherwise designated in writing by the copyright owner as "Not a Contribution").
39
+
40
+ "Contribution" shall mean any work of authorship, including
41
+ the original version of the Work and any modifications or additions
42
+ to that Work or Derivative Works thereof, that is intentionally
43
+ submitted to Licensor for inclusion in the Work by the copyright owner
44
+ or by an individual or Legal Entity authorized to submit on behalf of
45
+ the copyright owner. For the purposes of this definition, "submitted"
46
+ means any form of electronic, verbal, or written communication sent
47
+ to the Licensor or its representatives, including but not limited to
48
+ communication on electronic mailing lists, source code control systems,
49
+ and issue tracking systems that are managed by, or on behalf of, the
50
+ Licensor for the purpose of discussing and improving the Work, but
51
+ excluding communication that is conspicuously marked or otherwise
52
+ designated in writing by the copyright owner as "Not a Contribution."
53
+
54
+ "Contributor" shall mean Licensor and any individual or Legal Entity
55
+ on behalf of whom a Contribution has been received by Licensor and
56
+ subsequently incorporated within the Work.
57
+
58
+ 2. Grant of Copyright License. Subject to the terms and conditions of
59
+ this License, each Contributor hereby grants to You a perpetual,
60
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
61
+ copyright license to use, reproduce, modify, merge, publish,
62
+ distribute, sublicense, and/or sell copies of the Work, and to
63
+ permit persons to whom the Work is furnished to do so, subject to
64
+ the following conditions:
65
+
66
+ The above copyright notice and this permission notice shall be
67
+ included in all copies or substantial portions of the Work.
68
+
69
+ 3. Grant of Patent License. Subject to the terms and conditions of
70
+ this License, each Contributor hereby grants to You a perpetual,
71
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
72
+ (except as stated in this section) patent license to make, have made,
73
+ use, offer to sell, sell, import, and otherwise transfer the Work,
74
+ where such license applies only to those patent claims licensable
75
+ by such Contributor that are necessarily infringed by their
76
+ Contribution(s) alone or by combination of their Contribution(s)
77
+ with the Work to which such Contribution(s) was submitted. If You
78
+ institute patent litigation against any entity (including a
79
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
80
+ or a Contribution incorporated within the Work constitutes direct
81
+ or contributory patent infringement, then any patent licenses
82
+ granted to You under this License for that Work shall terminate
83
+ as of the date such litigation is filed.
84
+
85
+ 4. Redistribution. You may reproduce and distribute copies of the
86
+ Work or Derivative Works thereof in any medium, with or without
87
+ modifications, and in Source or Object form, provided that You
88
+ meet the following conditions:
89
+
90
+ (a) You must give any other recipients of the Work or
91
+ Derivative Works a copy of this License; and
92
+
93
+ (b) You must cause any modified files to carry prominent notices
94
+ stating that You changed the files; and
95
+
96
+ (c) You must retain, in the Source form of any Derivative Works
97
+ that You distribute, all copyright, trademark, patent, and
98
+ other attribution notices from the Source form of the Work,
99
+ excluding those notices that do not pertain to any part of
100
+ the Derivative Works; and
101
+
102
+ (d) If the Work includes a "NOTICE" file as part of its
103
+ distribution, then any Derivative Works that You distribute must
104
+ include a readable copy of the attribution notices contained
105
+ within such NOTICE file, excluding those notices that do not
106
+ pertain to any part of the Derivative Works, in at least one
107
+ of the following places: within a NOTICE file distributed
108
+ as part of the Derivative Works; within the Source form or
109
+ documentation, if provided along with the Derivative Works; or,
110
+ within a display generated by the Derivative Works, if and
111
+ wherever such third-party notices normally appear. The contents
112
+ of the NOTICE file are for informational purposes only and
113
+ do not modify the License. You may add Your own attribution
114
+ notices within Derivative Works that You distribute, alongside
115
+ or as an addendum to the NOTICE text from the Work, provided
116
+ that such additional attribution notices cannot be construed
117
+ as modifying the License.
118
+
119
+ You may add Your own copyright notice to Your modifications and
120
+ may provide additional or different license terms and conditions
121
+ for use, reproduction, or distribution of Your modifications, or
122
+ for any such Derivative Works as a whole, provided Your use,
123
+ reproduction, and distribution of the Work otherwise complies with
124
+ the conditions stated in this License.
125
+
126
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
127
+ any Contribution intentionally submitted for inclusion in the Work
128
+ by You to the Licensor shall be under the terms and conditions of
129
+ this License, without any additional terms or conditions.
130
+ Notwithstanding the above, nothing herein shall supersede or modify
131
+ the terms of any separate license agreement you may have executed
132
+ with Licensor regarding such Contributions.
133
+
134
+ 6. Trademarks. This License does not grant permission to use the trade
135
+ names, trademarks, service marks, or product names of the Licensor,
136
+ except as required for reasonable and customary use in describing the
137
+ origin of the Work and reproducing the content of the NOTICE file.
138
+
139
+ 7. Disclaimer of Warranty. Unless required by applicable law or
140
+ agreed to in writing, Licensor provides the Work (and each
141
+ Contributor provides its Contributions) on an "AS IS" BASIS,
142
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
143
+ implied, including, without limitation, any warranties or conditions
144
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
145
+ PARTICULAR PURPOSE. You are solely responsible for determining the
146
+ appropriateness of using or redistributing the Work and assume any
147
+ risks associated with Your exercise of permissions under this License.
148
+
149
+ 8. Limitation of Liability. In no event and under no legal theory,
150
+ whether in tort (including negligence), contract, or otherwise,
151
+ unless required by applicable law (such as deliberate and grossly
152
+ negligent acts) or agreed to in writing, shall any Contributor be
153
+ liable to You for damages, including any direct, indirect, special,
154
+ incidental, or consequential damages of any character arising as a
155
+ result of this License or out of the use or inability to use the
156
+ Work (including but not limited to damages for loss of goodwill,
157
+ work stoppage, computer failure or malfunction, or any and all
158
+ other commercial damages or losses), even if such Contributor
159
+ has been advised of the possibility of such damages.
160
+
161
+ 9. Accepting Warranty or Additional Liability. When redistributing
162
+ the Work or Derivative Works thereof, You may choose to offer,
163
+ and to charge a fee for, acceptance of support, warranty, indemnity,
164
+ or other liability obligations and/or rights consistent with this
165
+ License. However, in accepting such obligations, You may act only
166
+ on Your own behalf and on Your sole responsibility, not on behalf
167
+ of any other Contributor, and only if You agree to indemnify,
168
+ defend, and hold each Contributor harmless for any liability
169
+ incurred by, or claims asserted against, such Contributor by reason
170
+ of your accepting any such warranty or additional liability.
171
+
172
+ END OF TERMS AND CONDITIONS
173
+
174
+ APPENDIX: How to apply the Apache License to your work.
175
+
176
+ To apply the Apache License to your work, attach the following
177
+ boilerplate notice, with the fields enclosed by brackets "[]"
178
+ replaced with your own identifying information. (Don't include
179
+ the brackets!) The text should be enclosed in the appropriate
180
+ comment syntax for the file format. We also recommend that a
181
+ file or class name and description of purpose be included on the
182
+ same page as the copyright notice for easier identification within
183
+ third-party archives.
184
+
185
+ Copyright [yyyy] [name of copyright owner]
186
+
187
+ Licensed under the Apache License, Version 2.0 (the "License");
188
+ you may not use this file except in compliance with the License.
189
+ You may obtain a copy of the License at
190
+
191
+ http://www.apache.org/licenses/LICENSE-2.0
192
+
193
+ Unless required by applicable law or agreed to in writing, software
194
+ distributed under the License is distributed on an "AS IS" BASIS,
195
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
196
+ See the License for the specific language governing permissions and
197
+ limitations under the License.
MANIFEST.in ADDED
@@ -0,0 +1,39 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ include MANIFEST.in
5
+
6
+ # Include images directory for README.md
7
+ recursive-include images *
8
+
9
+ # Include package data
10
+ recursive-include algorithm *.json
11
+ recursive-include algorithm *.yaml
12
+ recursive-include algorithm *.yml
13
+ recursive-include algorithm *.txt
14
+ recursive-include algorithm *.md
15
+ recursive-include cli *.json
16
+ recursive-include cli *.yaml
17
+ recursive-include cli *.yml
18
+ recursive-include cli *.txt
19
+ recursive-include cli *.md
20
+
21
+
22
+ # Include templates and configuration files
23
+ include lf_algorithm/plugins/*/mcp_servers/*/templates.py
24
+ include lf_algorithm/plugins/*/mcp_servers/*/mcp_params.py
25
+
26
+ # Exclude development files
27
+ global-exclude *.pyc
28
+ global-exclude *.pyo
29
+ global-exclude __pycache__
30
+ global-exclude .DS_Store
31
+ global-exclude *.log
32
+ global-exclude .pytest_cache
33
+ global-exclude .mypy_cache
34
+ global-exclude .venv
35
+ global-exclude venv
36
+ global-exclude env
37
+ global-exclude .env
38
+ global-exclude .pypirc
39
+ global-exclude .ruff_cache
README.md CHANGED
@@ -1,12 +1,207 @@
1
  ---
2
- title: Lineagentic Flow
3
- emoji:
4
- colorFrom: blue
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: lineagentic-flow
3
+ app_file: start_demo_server.py
4
  sdk: gradio
5
+ sdk_version: 5.39.0
6
  ---
7
 
8
+ <div align="center">
9
+ <img src="https://raw.githubusercontent.com/lineagentic/lineagentic-flow/main/images/logo.jpg" alt="Lineagentic Logo" width="880" height="300">
10
+ </div>
11
+
12
+ ## Lineagentic-flow
13
+
14
+ Lineagentic-flow is an agentic AI solution for building end-to-end data lineage from diverse types of data processing scripts across different platforms. It is modular and customizable, and can be extended to support new script types. In a nutshell, this is what it does:
15
+
16
+ ```
17
+ ┌─────────────┐     ┌──────────────────────────────┐     ┌──────────────────┐
18
+ │ source-code │───▶ │  lineagentic-flow-algorithm  │───▶ │  lineage output  │
19
+ │             │     │                              │     │                  │
20
+ └─────────────┘     └──────────────────────────────┘     └──────────────────┘
21
+ ```
22
+ ### Features
23
+
24
+ - Plugin-based design pattern, simple to extend and customize.
25
+ - Command-line interface for quick analysis.
26
+ - Support for multiple data processing script types (SQL, Python, Airflow, Spark, etc.).
27
+ - Simple demo server that runs locally and on Hugging Face Spaces.
28
+
29
+ ## Quick Start
30
+
31
+ ### Installation
32
+
33
+ Install the package from PyPI:
34
+
35
+ ```bash
36
+ pip install lineagentic-flow
37
+ ```
38
+
39
+ ### Basic Usage
40
+
41
+ ```python
42
+ import asyncio
43
+ from lf_algorithm.framework_agent import FrameworkAgent
44
+ import logging
45
+
46
+ logging.basicConfig(
47
+ level=logging.INFO,
48
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
49
+ )
50
+
51
+ async def main():
52
+ # Create an agent for SQL lineage extraction
53
+ agent = FrameworkAgent(
54
+ agent_name="sql-lineage-agent",
55
+ model_name="gpt-4o-mini",
56
+ source_code="SELECT id, name FROM users WHERE active = true"
57
+ )
58
+
59
+ # Run the agent to extract lineage
60
+ result = await agent.run_agent()
61
+ print(result)
62
+
63
+ # Run the example
64
+ asyncio.run(main())
65
+ ```
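+
+ The returned result is an `AgentResult` (exported from `lf_algorithm`). Continuing the example above, here is a minimal sketch of inspecting it, assuming the `to_dict()` structure that the demo server relies on (`inputs` and `outputs` keys):
+
+ ```python
+ import json
+
+ # AgentResult exposes to_dict(); plain dicts pass through unchanged.
+ result_dict = result.to_dict() if hasattr(result, "to_dict") else result
+ print(json.dumps(result_dict, indent=2))
+ print(len(result_dict.get("inputs", [])), "input(s),",
+       len(result_dict.get("outputs", [])), "output(s)")
+ ```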
66
+ ### Supported Agents
67
+
68
+ The following table shows the current development status of the agents in the Lineagentic-flow algorithm:
69
+
70
+
71
+ | **Agent Name** | **Done** | **Under Development** | **In Backlog** | **Comment** |
72
+ |----------------------|:--------:|:----------------------:|:--------------:|--------------------------------------|
73
+ | python_lineage_agent | ✓ | | | |
74
+ | airflow_lineage_agent | ✓ | | | |
75
+ | java_lineage_agent | ✓ | | | |
76
+ | spark_lineage_agent | ✓ | | | |
77
+ | sql_lineage_agent | ✓ | | | |
78
+ | flink_lineage_agent | | | ✓ | |
79
+ | beam_lineage_agent | | | ✓ | |
80
+ | shell_lineage_agent | | | ✓ | |
81
+ | scala_lineage_agent | | | ✓ | |
82
+ | dbt_lineage_agent | | | ✓ | |
83
+
84
+
85
+ ### Environment Variables
86
+
87
+ Set your API keys:
88
+
89
+ ```bash
90
+ export OPENAI_API_KEY="your-openai-api-key"
91
+ export HF_TOKEN="your-huggingface-token" # Optional
92
+ ```
93
+
94
+ ## What are the components of Lineagentic-flow?
95
+
96
+ - Algorithm module: the brain of Lineagentic-flow. It contains agents, implemented as plugins, that act as a chain-of-thought process to extract lineage from different types of data processing scripts. The module is built on a plugin-based design pattern, so you can easily develop and integrate your own custom agents (see the sketch after this list).
97
+
98
+ - CLI module: a command-line wrapper around the algorithm API that connects to the unified service layer.
99
+
100
+ - Demo module: for teams who want to demo Lineagentic-flow in a fast and simple way, deployable to Hugging Face Spaces.
101
+
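+ Since agents are discovered through Python entry points (see `lf_algorithm/agent_manager.py`), a custom plugin only needs to expose a `get_plugin_info()` function returning its name and factory. Below is a minimal sketch of a hypothetical plugin; the `dbt` agent name, module path, and factory signature are illustrative assumptions, not part of the published API:
+
+ ```python
+ # my_plugin/lineage_agent.py -- hypothetical custom plugin module
+
+ def create_dbt_lineage_agent(agent_name, source_code=None,
+                              model_name="gpt-4o-mini",
+                              get_model_func=None, **kwargs):
+     """Factory: AgentManager calls this with agent_name and get_model_func."""
+     ...  # build and return your agent instance here
+
+ def get_plugin_info():
+     # AgentManager reads the 'name' and 'factory_function' keys from this dict.
+     return {
+         "name": "dbt-lineage-agent",
+         "factory_function": create_dbt_lineage_agent,
+     }
+
+ # Registered in pyproject.toml under the entry-point group, e.g.:
+ # [project.entry-points."lineagentic.lf_algorithm.plugins"]
+ # dbt-lineage-agent = "my_plugin.lineage_agent:get_plugin_info"
+ ```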
102
+ #### Command Line Interface (CLI)
103
+
104
+ Lineagentic-flow provides a powerful CLI tool for quick analysis:
105
+
106
+ ```bash
107
+ # Basic SQL query analysis
108
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT user_id, name FROM users WHERE active = true" --verbose
109
+
110
+ # Analyze a Python script from a file
111
+ lineagentic analyze --agent-name python-lineage-agent --query-file "my_script.py" --verbose
112
+
113
+ ```
114
+ For more details, see the [CLI documentation](cli/README.md).
115
+
116
+ ### Environment variables
117
+
118
+ - HF_TOKEN (HUGGINGFACE_TOKEN)
119
+ - OPENAI_API_KEY
120
+
121
+ ### Architecture
122
+
123
+ The following figure illustrates the architecture behind Lineagentic-flow: a multi-layer architecture of backend services and an agentic AI algorithm that leverages a chain-of-thought process to construct lineage across various script types.
124
+
125
+ ![Architecture Diagram](https://raw.githubusercontent.com/lineagentic/lineagentic-flow/main/images/architecture.png)
126
+
127
+
128
+ ## Mathematics behind the algorithm
129
+
130
+ The following shows the mathematics behind each layer of the algorithm.
131
+
132
+ ### Agent framework
133
+ The agent framework does IO operations, memory management, and prompt engineering, producing a prompt (P) according to the script type (T) and its content (C).
134
+
135
+ $$
136
+ P := f(T, C)
137
+ $$
138
+
139
+ ### Runtime orchestration agent
140
+
141
+ The runtime orchestration agent orchestrates the execution of the required agents, given the prompt (P) produced by the agent framework, by selecting each appropriate agent (A) and its corresponding task (T).
142
+
143
+ $$
144
+ G = h([\{(A_1, T_1), (A_2, T_2), (A_3, T_3), (A_4, T_4)\}], P)
145
+ $$
146
+
147
+ ### Syntax Analysis Agent
148
+
149
+ The Syntax Analysis agent analyzes the syntactic structure of the raw script to identify subqueries and nested structures, and decomposes the script into multiple subscripts.
150
+
151
+ $$
152
+ \{sa_1, \dots, sa_n\} := h([A_1, T_1], P)
153
+ $$
154
+
155
+ ### Field Derivation Agent
156
+ The Field Derivation agent processes each subscript produced by the Syntax Analysis agent to derive field-level mapping relationships and processing logic.
157
+
158
+ $$
159
+ \{fd_1, \dots, fd_n\} := h([A_2, T_2], \{sa_1, \dots, sa_n\})
160
+ $$
161
+
162
+ ### Operation Tracing Agent
163
+ The Operation Tracing agent analyzes the complex conditions within each subscript identified by the Syntax Analysis agent, including filter, join, grouping, and sorting conditions.
164
+
165
+ $$
166
+ \{ot_1, \dots, ot_n\} := h([A_3, T_3], \{sa_1, \dots, sa_n\})
167
+ $$
168
+
169
+ ### Event Composer Agent
170
+ The Event Composer agent consolidates the results from the Syntax Analysis, Field Derivation, and Operation Tracing agents to generate the final lineage result.
171
+
172
+ $$
173
+ \{A\} := h([A_4, T_4], \{sa_1, \dots, sa_n\}, \{fd_1, \dots, fd_n\}, \{ot_1, \dots, ot_n\})
174
+ $$
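+
+ To make the composition concrete, here is a minimal runnable Python sketch of the four-stage chain; the stage functions are illustrative placeholders, not the library API:
+
+ ```python
+ # Illustrative sketch of the chain-of-thought composition above.
+ def syntax_analysis(prompt):    return ["subscript 1", "subscript 2"]      # {sa_i}
+ def field_derivation(sa):       return [f"fields of {s}" for s in sa]      # {fd_i}
+ def operation_tracing(sa):      return [f"conditions of {s}" for s in sa]  # {ot_i}
+ def event_composer(sa, fd, ot): return {"subscripts": sa, "fields": fd, "operations": ot}
+
+ def run_pipeline(prompt):
+     sa = syntax_analysis(prompt)        # {sa_1, ..., sa_n} := h([A_1, T_1], P)
+     fd = field_derivation(sa)           # {fd_1, ..., fd_n} := h([A_2, T_2], {sa_i})
+     ot = operation_tracing(sa)          # {ot_1, ..., ot_n} := h([A_3, T_3], {sa_i})
+     return event_composer(sa, fd, ot)   # {A} := h([A_4, T_4], {sa_i}, {fd_i}, {ot_i})
+
+ print(run_pipeline("P"))
+ ```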
175
+
176
+
177
+
178
+ ## Activation and Deployment
179
+
180
+ To simplify the usage of Lineagentic-flow, a Makefile manages the various activation and deployment tasks. The most common targets are listed below; for full details, explore the Makefile directly.
181
+
182
+ 1- To start the demo server:
183
+
184
+ ```bash
185
+ make start-demo-server
186
+ ```
187
+ 2- To run all tests:
188
+
189
+ ```bash
190
+ make test
191
+ ```
192
+ 3- To build the package:
193
+
194
+ ```bash
195
+ make build-package
196
+ ```
197
+ 4- To clean the whole stack:
198
+
199
+ ```bash
200
+ make clean-all-stack
201
+ ```
202
+
203
+ 5- To deploy Lineagentic-flow to Hugging Face Spaces, run the following command (you need a Hugging Face account, and you must add your secret keys there if you are going to use paid models):
204
+
205
+ ```bash
206
+ make gradio-deploy
207
+ ```
cli/README.md ADDED
@@ -0,0 +1,167 @@
1
+ # Lineagentic-flow CLI
2
+
3
+ A command-line interface for the Lineagentic-flow framework that provides agentic data lineage parsing across various data processing script types.
4
+
5
+ ## Installation
6
+
7
+ The CLI is installed automatically along with the lineagentic-flow package (shown here as an editable install from source):
8
+
9
+ ```bash
10
+ pip install -e .
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ The CLI provides two main commands: `analyze` and `field-lineage`.
16
+
17
+ ### Basic Commands
18
+
19
+ #### Analyze Query/Code for Lineage
20
+ ```bash
21
+ lineagentic analyze --agent-name sql-lineage-agent --query "your code here"
22
+ ```
23
+
24
+
25
+ ### Running Analysis
26
+
27
+ #### Using a Specific Agent
28
+ ```bash
29
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a,b FROM table1"
30
+ ```
31
+
32
+ #### Using a File as Input
33
+ ```bash
34
+ lineagentic analyze --agent-name python-lineage-agent --query-file path/to/your/script.py
35
+ ```
36
+
37
+ #### Specifying a Different Model
38
+ ```bash
39
+ lineagentic analyze --agent-name airflow-lineage-agent --model-name gpt-4o --query "your code here"
40
+ ```
41
+
42
+ #### With Lineage Configuration
43
+ ```bash
44
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --job-namespace "my-namespace" --job-name "my-job"
45
+ ```
46
+
47
+ ### Output Options
48
+
49
+ #### Pretty Print Results
50
+ ```bash
51
+ lineagentic analyze --agent-name sql --query "your code" --pretty
52
+ ```
53
+
54
+ #### Save Results to File
55
+ ```bash
56
+ lineagentic analyze --agent-name sql --query "your code" --output results.json
57
+ ```
58
+
59
+ #### Save Results with Pretty Formatting
60
+ ```bash
61
+ lineagentic analyze --agent-name python --query "your code" --output results.json --pretty
62
+ ```
63
+
64
+ #### Enable Verbose Output
65
+ ```bash
66
+ lineagentic analyze --agent-name sql --query "your code" --verbose
67
+ ```
68
+
69
+ ## Available Agents
70
+
71
+ - **sql-lineage-agent**: Analyzes SQL queries and scripts (default)
72
+ - **airflow-lineage-agent**: Analyzes Apache Airflow DAGs and workflows
73
+ - **spark-lineage-agent**: Analyzes Apache Spark jobs
74
+ - **python-lineage-agent**: Analyzes Python data processing scripts
75
+ - **java-lineage-agent**: Analyzes Java data processing code
76
+
77
+ ## Commands
78
+
79
+ ### `analyze` Command
80
+
81
+ Analyzes a query or code for lineage information.
82
+
83
+ #### Required Arguments
84
+ - Either `--query` or `--query-file` must be specified
85
+
86
+ ### Basic Query Analysis
87
+ ```bash
88
+ # Simple SQL query analysis
89
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT user_id, name FROM users WHERE active = true"
90
+
91
+ # Analyze with specific agent
92
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a, b FROM table1 JOIN table2 ON table1.id = table2.id"
93
+
94
+ # Analyze Python code
95
+ lineagentic analyze --agent-name python-lineage-agent --query "import pandas as pd; df = pd.read_csv('data.csv'); result = df.groupby('category').sum()"
96
+
97
+ # Analyze Java code
98
+ lineagentic analyze --agent-name java-lineage-agent --query "public class DataProcessor { public void processData() { // processing logic } }"
99
+
100
+ # Analyze Spark code
101
+ lineagentic analyze --agent-name spark-lineage-agent --query "val df = spark.read.csv('data.csv'); val result = df.groupBy('category').agg(sum('value'))"
102
+
103
+ # Analyze Airflow DAG
104
+ lineagentic analyze --agent-name airflow-lineage-agent --query "from airflow import DAG; from airflow.operators.python import PythonOperator; dag = DAG('my_dag')"
105
+ ```
106
+
107
+
108
+ ### Reading from File
109
+ ```bash
110
+ # Analyze query from file
111
+ lineagentic analyze --agent-name sql-lineage-agent --query-file "queries/user_analysis.sql"
112
+
113
+ # Analyze Python script from file
114
+ lineagentic analyze --agent-name python-lineage-agent --query-file "scripts/data_processing.py"
115
+ ```
116
+
117
+ ### Output Options
118
+ ```bash
119
+ # Save results to file
120
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --output "results.json"
121
+
122
+ # Pretty print results
123
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --pretty
124
+
125
+ # Verbose output
126
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --verbose
127
+
128
+ # Don't save to database
129
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --no-save
130
+
131
+ # Don't save to Neo4j
132
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --no-neo4j
133
+ ```
134
+
135
+
136
+
137
+ ## Common Output Options
138
+
139
+ Both commands support these output options:
140
+
141
+ - `--output`: Output file path for results (JSON format)
142
+ - `--pretty`: Pretty print the output
143
+ - `--verbose`: Enable verbose output
144
+
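+ For example, a saved `results.json` can be inspected like this (a minimal sketch; the `inputs`/`outputs` keys follow the AgentResult structure used elsewhere in the repository):
+
+ ```python
+ import json
+
+ # Load a result file written via --output results.json
+ with open("results.json", encoding="utf-8") as f:
+     result = json.load(f)
+ print(result.get("inputs"), result.get("outputs"))
+ ```
+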
145
+ ## Error Handling
146
+
147
+ The CLI provides clear error messages for common issues:
148
+
149
+ - Missing required arguments
150
+ - File not found errors
151
+ - Agent execution errors
152
+ - Invalid agent names
153
+
154
+ ## Development
155
+
156
+ To run the CLI in development mode:
157
+
158
+ ```bash
159
+ python -m cli.main --help
160
+ ```
161
+
162
+ To run a specific command:
163
+
164
+ ```bash
165
+ python -m cli.main analyze --agent-name sql --query "SELECT 1" --pretty
166
+ ```
167
+
cli/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """
2
+ CLI package for lineagentic framework.
3
+ """
4
+
5
+ __version__ = "0.1.0"
cli/main.py ADDED
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Main CLI entry point for lineagentic framework.
4
+ """
5
+
6
+ import asyncio
7
+ import argparse
8
+ import sys
9
+ import os
10
+ import logging
11
+ from pathlib import Path
12
+
13
+ # Add the project root to the Python path
14
+ project_root = Path(__file__).parent.parent
15
+ sys.path.insert(0, str(project_root))
16
+
17
+ from lf_algorithm.framework_agent import FrameworkAgent
18
+
19
+
20
+ def configure_logging(verbose: bool = False, quiet: bool = False):
21
+ """Configure logging for the CLI application."""
22
+ if quiet:
23
+ # Quiet mode: only show errors
24
+ logging.basicConfig(
25
+ level=logging.ERROR,
26
+ format='%(levelname)s: %(message)s'
27
+ )
28
+ elif verbose:
29
+ # Verbose mode: show all logs with detailed format
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
33
+ datefmt='%Y-%m-%d %H:%M:%S'
34
+ )
35
+ else:
36
+ # Normal mode: show only important logs with clean format
37
+ logging.basicConfig(
38
+ level=logging.WARNING, # Only show warnings and errors by default
39
+ format='%(levelname)s: %(message)s'
40
+ )
41
+
42
+ # Set specific loggers to INFO level for better user experience
43
+ logging.getLogger('lf_algorithm').setLevel(logging.INFO)
44
+ logging.getLogger('lf_algorithm.framework_agent').setLevel(logging.INFO)
45
+ logging.getLogger('lf_algorithm.agent_manager').setLevel(logging.INFO)
46
+
47
+ # Suppress noisy server logs from MCP tools
48
+ logging.getLogger('mcp').setLevel(logging.WARNING)
49
+ logging.getLogger('agents.mcp').setLevel(logging.WARNING)
50
+ logging.getLogger('agents.mcp.server').setLevel(logging.WARNING)
51
+ logging.getLogger('agents.mcp.server.stdio').setLevel(logging.WARNING)
52
+ logging.getLogger('agents.mcp.server.stdio.stdio').setLevel(logging.WARNING)
53
+
54
+ # Suppress MCP library logs specifically
55
+ logging.getLogger('mcp.server').setLevel(logging.WARNING)
56
+ logging.getLogger('mcp.server.fastmcp').setLevel(logging.WARNING)
57
+ logging.getLogger('mcp.server.stdio').setLevel(logging.WARNING)
58
+
59
+ # Suppress any logger that contains 'server' in the name
60
+ for logger_name in logging.root.manager.loggerDict:
61
+ if 'server' in logger_name.lower():
62
+ logging.getLogger(logger_name).setLevel(logging.WARNING)
63
+
64
+ # Additional MCP-specific suppressions
65
+ logging.getLogger('mcp.server.stdio.stdio').setLevel(logging.WARNING)
66
+ logging.getLogger('mcp.server.stdio.stdio.stdio').setLevel(logging.WARNING)
67
+
68
+ def create_parser():
69
+ """Create and configure the argument parser."""
70
+ parser = argparse.ArgumentParser(
71
+ description="Lineagentic - Agentic approach for code analysis and lineage extraction",
72
+ formatter_class=argparse.RawDescriptionHelpFormatter,
73
+ epilog="""
74
+ Examples:
75
+
76
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a,b FROM table1"
77
+ lineagentic analyze --agent-name python-lineage-agent --query-file "my_script.py"
78
+ """
79
+ )
80
+
81
+ # Create subparsers for the CLI commands (only 'analyze' is registered here)
82
+ subparsers = parser.add_subparsers(dest='command', help='Available commands')
83
+
84
+ # Analyze query subparser
85
+ analyze_parser = subparsers.add_parser('analyze', help='Analyze code or query for lineage information')
86
+ analyze_parser.add_argument(
87
+ "--agent-name",
88
+ type=str,
89
+ default="sql",
90
+ help="Name of the agent to use (e.g., sql, airflow, spark, python, java) (default: sql)"
91
+ )
92
+ analyze_parser.add_argument(
93
+ "--model-name",
94
+ type=str,
95
+ default="gpt-4o-mini",
96
+ help="Model to use for the agents (default: gpt-4o-mini)"
97
+ )
98
+ analyze_parser.add_argument(
99
+ "--query",
100
+ type=str,
101
+ help="Code or query to analyze"
102
+ )
103
+ analyze_parser.add_argument(
104
+ "--query-file",
105
+ type=str,
106
+ help="Path to file containing the query/code to analyze"
107
+ )
108
+
109
+ # Common output options
110
+ analyze_parser.add_argument(
111
+ "--output",
112
+ type=str,
113
+ help="Output file path for results (JSON format)"
114
+ )
115
+ analyze_parser.add_argument(
116
+ "--pretty",
117
+ action="store_true",
118
+ help="Pretty print the output"
119
+ )
120
+ analyze_parser.add_argument(
121
+ "--verbose",
122
+ action="store_true",
123
+ help="Enable verbose output with detailed logging"
124
+ )
125
+ analyze_parser.add_argument(
126
+ "--quiet",
127
+ action="store_true",
128
+ help="Suppress all output except errors"
129
+ )
130
+
131
+ return parser
132
+
133
+
134
+ def read_query_file(file_path: str) -> str:
135
+ """Read query from a file."""
136
+ try:
137
+ with open(file_path, 'r', encoding='utf-8') as f:
138
+ return f.read()
139
+ except FileNotFoundError:
140
+ print(f"Error: File '{file_path}' not found.")
141
+ sys.exit(1)
142
+ except Exception as e:
143
+ print(f"Error reading file '{file_path}': {e}")
144
+ sys.exit(1)
145
+
146
+
147
+
148
+
149
+
150
+ def save_output(result, output_file: str = None, pretty: bool = False):
151
+ """Save or print the result."""
152
+ # Convert AgentResult to dict if needed
153
+ if hasattr(result, 'to_dict'):
154
+ result_dict = result.to_dict()
155
+ else:
156
+ result_dict = result
157
+
158
+ if output_file:
159
+ import json
160
+ with open(output_file, 'w', encoding='utf-8') as f:
161
+ json.dump(result_dict, f, indent=2 if pretty else None)
162
+ print(f"Results saved to '{output_file}'")
163
+ else:
164
+ if pretty:
165
+ import json
166
+ print("\n" + "="*50)
167
+ print("ANALYSIS RESULTS")
168
+ print("="*50)
169
+ print(json.dumps(result_dict, indent=2))
170
+ print("="*50)
171
+ else:
172
+ print("\nResults:", result_dict)
173
+
174
+
175
+ async def run_analyze_query(args):
176
+ """Run analyze_query operation."""
177
+ logger = logging.getLogger(__name__)
178
+
179
+ # Get the query
180
+ query = args.query
181
+ if args.query_file:
182
+ query = read_query_file(args.query_file)
183
+
184
+ if not query:
185
+ logger.error("Either --query or --query-file must be specified.")
186
+ sys.exit(1)
187
+
188
+ logger.info(f"Running agent '{args.agent_name}' with query...")
189
+
190
+ try:
191
+ # Create FrameworkAgent instance
192
+ agent = FrameworkAgent(
193
+ agent_name=args.agent_name,
194
+ model_name=args.model_name,
195
+ source_code=query
196
+ )
197
+
198
+ # Run the agent
199
+ result = await agent.run_agent()
200
+
201
+ save_output(result, args.output, args.pretty)
202
+
203
+ except Exception as e:
204
+ logger.error(f"Error running agent '{args.agent_name}': {e}")
205
+ sys.exit(1)
206
+
207
+
208
+
209
+
210
+
211
+ async def main_async():
212
+ """Main CLI function."""
213
+ parser = create_parser()
214
+ args = parser.parse_args()
215
+
216
+ # Check if a command was provided
217
+ if not args.command:
218
+ parser.print_help()
219
+ sys.exit(1)
220
+
221
+ # Configure logging based on verbosity
222
+ configure_logging(verbose=args.verbose, quiet=args.quiet)
223
+
224
+ # Run the appropriate command
225
+ if args.command == 'analyze':
226
+ await run_analyze_query(args)
227
+ else:
228
+ print(f"Unknown command: {args.command}")
229
+ sys.exit(1)
230
+
231
+
232
+ def main():
233
+ """Synchronous wrapper for the async main function."""
234
+ asyncio.run(main_async())
235
+
236
+
237
+ if __name__ == "__main__":
238
+ main()
demo_server.py ADDED
@@ -0,0 +1,321 @@
1
+ import gradio as gr
2
+ import asyncio
3
+ import json
4
+ import threading
5
+ import time
6
+ import sys
7
+ import os
8
+ import logging
9
+ from typing import Optional, Dict, Any
10
+ from datetime import datetime
11
+
12
+ # Import from the published package
13
+ from lf_algorithm import FrameworkAgent
14
+ from lf_algorithm.utils import write_lineage_log
15
+
16
+ # Configure logging for the demo server
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20
+ datefmt='%Y-%m-%d %H:%M:%S'
21
+ )
22
+
23
+ class SQLLineageFrontend:
24
+ def __init__(self):
25
+ self.agent_framework = None
26
+ self.current_results = None
27
+ self.current_agent_name = None
28
+ self.log_thread = None
29
+ self.should_stop_logging = False
30
+ self.logger = logging.getLogger(__name__)
31
+
32
+ def get_visualize_link(self) -> str:
33
+ """Generate JSONCrack visualization interface for aggregation data"""
34
+ if self.current_results is None:
35
+ return """
36
+ <div style='text-align: center; padding: 20px; color: #868e96;'>
37
+ <div style='font-size: 16px; margin-bottom: 15px;'>📊 Visualization Ready</div>
38
+ <div style='font-size: 14px; margin-bottom: 20px;'>
39
+ After you run analysis and succeed, you need to got to the following link:<br>
40
+ </div>
41
+ <a href='https://jsoncrack.com/editor' target='_blank' style='color: #007bff; text-decoration: none; font-weight: bold; font-size: 16px;'>
42
+ 🔗 Open editor for simple check and paste the results there
43
+ </a>
44
+ </div>
45
+ """
46
+
47
+ try:
48
+ # Get the aggregation data - now it's directly the current_results
49
+ aggregation_data = self.current_results
50
+
51
+ # Handle different result types
52
+ if isinstance(aggregation_data, str):
53
+ try:
54
+ # Try to parse as JSON first
55
+ parsed_data = json.loads(aggregation_data)
56
+ data_to_encode = parsed_data
57
+ except json.JSONDecodeError:
58
+ # If it's not valid JSON, wrap it in a dict
59
+ data_to_encode = {"aggregation_output": aggregation_data}
60
+ elif hasattr(aggregation_data, 'to_dict'):
61
+ # Handle AgentResult objects
62
+ data_to_encode = aggregation_data.to_dict()
63
+ elif isinstance(aggregation_data, dict):
64
+ data_to_encode = aggregation_data
65
+ else:
66
+ # Fallback for other object types
67
+ data_to_encode = {"aggregation_output": str(aggregation_data)}
68
+
69
+ # Format JSON for display
70
+ formatted_json = json.dumps(data_to_encode, indent=2)
71
+
72
+ return f"""
73
+ <div style='text-align: center; padding: 10px;'>
74
+ <div style='color: #28a745; font-size: 16px; margin-bottom: 15px; font-weight: bold;'>
75
+ ✅ Analysis Complete! Ready for Visualization
76
+ </div>
77
+ <div style='color: #007bff; font-size: 14px; margin-bottom: 20px;'>
78
+ 📋 Steps to visualize your results:<br>
79
+ 1. Click "Open JSONCrack Editor" below<br>
80
+ 2. Click "Copy JSON" button or click the JSON data below to select all<br>
81
+ 3. Paste it into the JSONCrack editor
82
+ </div>
83
+ <a href='https://jsoncrack.com/editor' target='_blank' style='color: #007bff; text-decoration: none; font-weight: bold; font-size: 16px; padding: 10px 20px; border: 2px solid #007bff; border-radius: 5px; display: inline-block; margin-bottom: 15px;'>
84
+ 🔗 Open JSONCrack Editor
85
+ </a>
86
+ <br><br>
87
+ <div style='background: #f8f9fa; border: 1px solid #e0e0e0; border-radius: 5px; padding: 15px; margin: 10px 0;'>
88
+ <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'>
89
+ <div style='font-weight: bold; color: #333;'>📄 Analysis Results (JSON)</div>
90
+ <button onclick="document.getElementById('json-textarea').select(); document.getElementById('json-textarea').setSelectionRange(0, 99999); navigator.clipboard.writeText(document.getElementById('json-textarea').value).then(() => alert('JSON copied to clipboard!')).catch(() => alert('Failed to copy. Please select and copy manually.'));" style='background: #28a745; color: white; border: none; padding: 8px 16px; border-radius: 4px; cursor: pointer; font-weight: bold; width: 120px;'>📋 Copy JSON</button>
91
+ </div>
92
+ <textarea id="json-textarea" readonly style='background: #ffffff; color: #000000; padding: 12px; border-radius: 3px; border: 1px solid #e0e0e0; font-family: monospace; font-size: 12px; width: 100%; height: 250px; resize: vertical; cursor: text;' onclick="this.select(); this.setSelectionRange(0, 99999);" title="Click to select all JSON">{formatted_json}</textarea>
93
+ </div>
94
+ </div>
95
+ """
96
+ except Exception as e:
97
+ return f"<div style='color: #ff6b6b;'>❌ Error generating visualization data: {str(e)}</div>"
98
+
99
+ def get_logs_html(self) -> str:
100
+ """Generate HTML for live logs display"""
101
+ if self.current_agent_name is None:
102
+ return "<div style='color: #868e96;'>No agent initialized yet</div>"
103
+
104
+ return f"""<div style='background: #f8f9fa; border: 1px solid #e0e0e0; border-radius: 5px; padding: 15px;'>
105
+ <div style='color: #28a745; font-weight: bold; margin-bottom: 10px;'>
106
+ 📝 Logging Status for Agent: {self.current_agent_name}
107
+ </div>
108
+ <div style='color: #6c757d; font-size: 14px; line-height: 1.5;'>
109
+ ✅ <strong>Standard Python Logging Active</strong><br>
110
+ • All logs are being captured by the application's logging system<br>
111
+ • Check your console/terminal for real-time log output<br>
112
+ • Logs include detailed information about agent execution<br>
113
+ • Structured logging with timestamps and log levels<br><br>
114
+
115
+ 📋 <strong>Log Types Available:</strong><br>
116
+ • <span style='color: #007bff;'>INFO</span> - General information and progress<br>
117
+ • <span style='color: #28a745;'>DEBUG</span> - Detailed debugging information<br>
118
+ • <span style='color: #ffc107;'>WARNING</span> - Warning messages<br>
119
+ • <span style='color: #dc3545;'>ERROR</span> - Error messages<br><br>
120
+
121
+ 🔍 <strong>What You'll See:</strong><br>
122
+ • Agent initialization and configuration<br>
123
+ • MCP tool interactions and responses<br>
124
+ • Analysis progress and completion status<br>
125
+ • Any errors or warnings during execution
126
+ </div>
127
+ </div>"""
128
+
129
+ def test_log_writing(self):
130
+ """Test function to write a sample log entry"""
131
+ if self.current_agent_name:
132
+ try:
133
+ write_lineage_log(self.current_agent_name, "test", "Test log entry from frontend")
134
+ self.logger.info(f"Test log written successfully for agent: {self.current_agent_name}")
135
+ return f"✅ Test log written successfully for agent: {self.current_agent_name}! Check your console output."
136
+ except Exception as e:
137
+ self.logger.error(f"Failed to write test log: {e}")
138
+ return f"❌ Failed to write test log: {e}"
139
+ else:
140
+ return "⚠️ Please initialize an agent first by running an analysis"
141
+
142
+ def get_results_info(self) -> str:
143
+ """Get information about the current results"""
144
+ if self.current_results is None:
145
+ return "No results available yet"
146
+
147
+ if isinstance(self.current_results, dict) and "error" in self.current_results:
148
+ return f"Error in results: {self.current_results['error']}"
149
+
150
+ if hasattr(self.current_results, 'to_dict'):
151
+ # AgentResult object
152
+ result_dict = self.current_results.to_dict()
153
+ inputs_count = len(result_dict.get('inputs', []))
154
+ outputs_count = len(result_dict.get('outputs', []))
155
+ return f"✅ Structured results with {inputs_count} input(s) and {outputs_count} output(s)"
156
+
157
+ if isinstance(self.current_results, dict):
158
+ return f"✅ Dictionary results with {len(self.current_results)} keys"
159
+
160
+ return f"✅ Results type: {type(self.current_results).__name__}"
161
+
162
+ async def run_analysis(self, agent_name: str, model_name: str, query: str):
163
+ """Run SQL lineage analysis"""
164
+ try:
165
+ # Validate input
166
+ if not query or not query.strip():
167
+ return "❌ Error: Query cannot be empty. Please provide a valid query for analysis."
168
+
169
+ self.logger.info(f"Starting analysis with agent: {agent_name}, model: {model_name}")
170
+
171
+ # Initialize the agent framework with simplified constructor
172
+ self.agent_framework = FrameworkAgent(
173
+ agent_name=agent_name,
174
+ model_name=model_name,
175
+ source_code=query.strip()
176
+ )
177
+ self.current_agent_name = agent_name
178
+
179
+ self.logger.info("Agent framework initialized. Running analysis...")
180
+
181
+ # Run the analysis using the structured results method
182
+ results = await self.agent_framework.run_agent()
183
+ self.current_results = results
184
+
185
+ # Check if we got an error response
186
+ if isinstance(results, dict) and "error" in results:
187
+ self.logger.error(f"Analysis failed: {results['error']}")
188
+ return f"❌ Analysis failed: {results['error']}"
189
+
190
+ self.logger.info(f"Analysis completed successfully for agent: {agent_name}")
191
+
192
+ return f"""✅ Analysis completed successfully! Results are now available in the visualization section.
193
+ Click 'Open JSONCrack Editor' to visualize your data lineage.
194
+
195
+ If you want to set up your own local development environment or deploy this in production,
196
+ please refer to the GitHub repository mentioned above."""
197
+
198
+ except ValueError as ve:
199
+ self.logger.error(f"Validation error: {ve}")
200
+ return f"❌ Validation error: {str(ve)}"
201
+ except Exception as e:
202
+ self.logger.error(f"Error running analysis: {e}")
203
+ return f"❌ Error running analysis: {str(e)}"
204
+
205
+ def run_analysis_sync(self, agent_name: str, model_name: str, query: str):
206
+ """Synchronous wrapper for run_analysis"""
207
+ return asyncio.run(self.run_analysis(agent_name, model_name, query))
208
+
209
+ def create_ui(self):
210
+ """Create the Gradio interface"""
211
+ with gr.Blocks(title="SQL Lineage Analysis", fill_width=True) as ui:
212
+
213
+ gr.Markdown('<div style="text-align: center;font-size:24px">🔍 Demo Lineagentic-Flow</div>')
214
+ gr.Markdown('<div style="text-align: center;font-size:14px">Analyze data lineage with AI-powered agents</div>')
215
+ gr.Markdown('<div style="text-align: center;font-size:14px">Check out agent types for supporting script types</div>')
216
+ gr.Markdown('<div style="text-align: center;font-size:14px">For local and production runs, check out the repo: <a href="https://github.com/lineagentic/lineagentic-flow" target="_blank" style="color: #007bff; text-decoration: none; font-weight: bold;">🔗 https://github.com/lineagentic/lineagentic-flow</a></div>')
217
+
218
+ with gr.Row():
219
+ # Left column - Configuration and Query
220
+ with gr.Column(scale=1):
221
+ gr.Markdown("### 1. Agent Configuration")
222
+ agent_dropdown = gr.Dropdown(
223
+ label="Agent Type",
224
+ choices=[
225
+ "sql-lineage-agent",
226
+ "python-lineage-agent",
227
+ "airflow-lineage-agent",
228
+ "java-lineage-agent",
229
+ "spark-lineage-agent"
230
+ ],
231
+ value="sql-lineage-agent"
232
+ )
233
+ model_dropdown = gr.Dropdown(
234
+ label="Model",
235
+ choices=[
236
+ "gpt-4o-mini",
237
+ "gpt-4o",
238
+ "deepseek-coder",
239
+ "deepseek-chat",
240
+ "gemini-pro"
241
+ ],
242
+ value="gpt-4o-mini"
243
+ )
244
+
245
+ gr.Markdown("### 2. Query for Lineage Analysis")
246
+ query_input = gr.Textbox(
247
+ label="Query",
248
+ placeholder="Enter your SQL query here...",
249
+ lines=9,
250
+ max_lines=15
251
+ )
252
+
253
+ analyze_button = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
254
+ status_output = gr.Textbox(label="Status", interactive=False)
255
+
256
+ # Right column - Visualization and Logs
257
+ with gr.Column(scale=1):
258
+ gr.Markdown("### 3. Results Information")
259
+ results_info = gr.Textbox(
260
+ label="Results Status",
261
+ value=self.get_results_info(),
262
+ interactive=False
263
+ )
264
+
265
+ gr.Markdown("### 4. Visualize Results")
266
+ gr.Markdown("📊 After successful analysis, visualize your results in demo editor")
267
+ visualize_html = gr.HTML(self.get_visualize_link())
268
+
269
+ gr.Markdown("### 5. Logging Information")
270
+ logs_html = gr.HTML(self.get_logs_html())
271
+ test_log_button = gr.Button("Test Log Writing", variant="secondary", size="sm")
272
+
273
+ # Auto-refresh logs every 5 seconds
274
+ refresh_logs = gr.Button("🔄 Refresh Logs", variant="secondary", size="sm")
275
+ refresh_results = gr.Button("🔄 Refresh Results Info", variant="secondary", size="sm")
276
+
277
+ # Event handlers
278
+ def run_analysis_and_update(agent_name, model_name, query):
279
+ """Run analysis and update visualization"""
280
+ # Run the analysis
281
+ status_result = self.run_analysis_sync(agent_name, model_name, query)
282
+ # Update visualization, logs, and results info
283
+ viz_html = self.get_visualize_link()
284
+ logs_html = self.get_logs_html()
285
+ results_info = self.get_results_info()
286
+ return status_result, results_info, viz_html, logs_html
287
+
288
+ analyze_button.click(
289
+ fn=run_analysis_and_update,
290
+ inputs=[agent_dropdown, model_dropdown, query_input],
291
+ outputs=[status_output, results_info, visualize_html, logs_html]
292
+ )
293
+
294
+ test_log_button.click(
295
+ fn=self.test_log_writing,
296
+ inputs=[],
297
+ outputs=[status_output]
298
+ )
299
+
300
+ refresh_logs.click(
301
+ fn=self.get_logs_html,
302
+ inputs=[],
303
+ outputs=[logs_html]
304
+ )
305
+
306
+ refresh_results.click(
307
+ fn=self.get_results_info,
308
+ inputs=[],
309
+ outputs=[results_info]
310
+ )
311
+
312
+ return ui
313
+
314
+ def run(self):
315
+ """Launch the Gradio interface"""
316
+ ui = self.create_ui()
317
+ ui.launch(share=False, inbrowser=True)
318
+
319
+ if __name__ == "__main__":
320
+ frontend = SQLLineageFrontend()
321
+ frontend.run()
deploy_setup.py ADDED
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Deployment setup script for Hugging Face Spaces
4
+ This script installs the local package after all files are copied
5
+ """
6
+
7
+ import subprocess
8
+ import sys
9
+ import os
10
+
11
+ def install_local_package():
12
+ """Install the local package in editable mode"""
13
+ try:
14
+ print("📦 Installing local lineagentic-flow package...")
15
+
16
+ # First, try to install in editable mode
17
+ result = subprocess.run([
18
+ sys.executable, "-m", "pip", "install", "-e", "."
19
+ ], capture_output=True, text=True, cwd=os.getcwd())
20
+
21
+ if result.returncode == 0:
22
+ print("✅ Local package installed successfully!")
23
+
24
+ # Verify that entry points are registered
25
+ try:
26
+ import importlib.metadata
27
+ entry_points = list(importlib.metadata.entry_points(group='lineagentic.lf_algorithm.plugins'))
28
+ print(f"✅ Found {len(entry_points)} registered plugins:")
29
+ for ep in entry_points:
30
+ print(f" - {ep.name}")
31
+ return True
32
+ except Exception as e:
33
+ print(f"⚠️ Warning: Could not verify entry points: {e}")
34
+ return True
35
+ else:
36
+ print(f"❌ Failed to install local package: {result.stderr}")
37
+ return False
38
+ except Exception as e:
39
+ print(f"❌ Error installing local package: {e}")
40
+ return False
41
+
42
+ if __name__ == "__main__":
43
+ install_local_package()
lf_algorithm/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ # lf_algorithm/__init__.py
2
+ import logging
3
+
4
+ # Add NullHandler to prevent "No handler could be found" warnings
5
+ # This is the only logging configuration the library should do
6
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
7
+
8
+ from .framework_agent import FrameworkAgent
9
+ from .utils import write_lineage_log
10
+ from .utils.file_utils import dump_json_record, read_json_records, clear_json_file, get_file_stats
11
+ from .utils.tracers import LogTracer, log_trace_id
12
+ from .models.models import AgentResult
13
+ from .plugins.sql_lineage_agent.lineage_agent import SqlLineageAgent, create_sql_lineage_agent, get_plugin_info as get_sql_plugin_info
14
+ from .plugins.python_lineage_agent.lineage_agent import PythonLineageAgent, create_python_lineage_agent, get_plugin_info as get_python_plugin_info
15
+ from .plugins.airflow_lineage_agent.lineage_agent import AirflowLineageAgent, create_airflow_lineage_agent, get_plugin_info as get_airflow_plugin_info
16
+ from .plugins.java_lineage_agent.lineage_agent import JavaLineageAgent, create_java_lineage_agent, get_plugin_info as get_java_plugin_info
17
+ from .plugins.spark_lineage_agent.lineage_agent import SparkLineageAgent, create_spark_lineage_agent, get_plugin_info as get_spark_plugin_info
18
+
19
+ __version__ = "0.1.0"
20
+
21
+ __all__ = [
22
+ 'FrameworkAgent',
23
+ 'AgentResult',
24
+ 'write_lineage_log',
25
+ 'dump_json_record',
26
+ 'read_json_records',
27
+ 'clear_json_file',
28
+ 'get_file_stats',
29
+ 'LogTracer',
30
+ 'log_trace_id',
31
+ 'SqlLineageAgent',
32
+ 'create_sql_lineage_agent',
33
+ 'get_sql_plugin_info',
34
+ 'PythonLineageAgent',
35
+ 'create_python_lineage_agent',
36
+ 'get_python_plugin_info',
37
+ 'AirflowLineageAgent',
38
+ 'create_airflow_lineage_agent',
39
+ 'get_airflow_plugin_info',
40
+ 'JavaLineageAgent',
41
+ 'create_java_lineage_agent',
42
+ 'get_java_plugin_info',
43
+ 'SparkLineageAgent',
44
+ 'create_spark_lineage_agent',
45
+ 'get_spark_plugin_info'
46
+ ]
lf_algorithm/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.75 kB).
 
lf_algorithm/__pycache__/agent_manager.cpython-313.pyc ADDED
Binary file (4.56 kB). View file
 
lf_algorithm/__pycache__/framework_agent.cpython-313.pyc ADDED
Binary file (5.79 kB). View file
 
lf_algorithm/agent_manager.py ADDED
@@ -0,0 +1,84 @@
+ import importlib.metadata
+ from typing import Dict, Any, Optional, Type, Callable
+
+ from .utils import get_logger, get_model, validate_api_keys
+
+ logger = get_logger(__name__)
+
+
+ class AgentManager:
+     """Manages plugin discovery and loading for the FrameworkAgent"""
+
+     def __init__(self):
+         self.agents: Dict[str, Dict[str, Any]] = {}
+         self.agent_factories: Dict[str, Callable] = {}
+         self._load_plugins()
+         # Validate API keys on initialization
+         validate_api_keys()
+
+     def _load_plugins(self):
+         """Load all available agent plugins using entry points"""
+         try:
+             # Load plugins from the 'lineagentic.lf_algorithm.plugins' entry point group
+             for entry_point in importlib.metadata.entry_points(group='lineagentic.lf_algorithm.plugins'):
+                 try:
+                     agent_info = entry_point.load()
+                     if callable(agent_info):
+                         # If it's a function, assume it returns plugin info
+                         agent_data = agent_info()
+                     else:
+                         # If it's already a dict/object
+                         agent_data = agent_info
+
+                     agent_name = agent_data.get('name', entry_point.name)
+                     self.agents[agent_name] = agent_data
+
+                     # Store the factory function if available
+                     if 'factory_function' in agent_data:
+                         self.agent_factories[agent_name] = agent_data['factory_function']
+
+                     logger.info(f"Loaded plugin: {agent_name}")
+
+                 except Exception as e:
+                     logger.error(f"Failed to load plugin {entry_point.name}: {e}")
+
+         except Exception as e:
+             logger.error(f"Error loading plugins: {e}")
+
+     def get_agent(self, agent_name: str) -> Optional[Dict[str, Any]]:
+         """Get agent information by name"""
+         return self.agents.get(agent_name)
+
+     def list_agents(self) -> Dict[str, Dict[str, Any]]:
+         """List all available agents"""
+         return self.agents.copy()
+
+     def create_agent(self, agent_name: str, **kwargs) -> Any:
+         """Create an agent instance using the agent's factory function"""
+         if agent_name not in self.agent_factories:
+             raise ValueError(f"Agent '{agent_name}' not found or has no factory function")
+
+         factory = self.agent_factories[agent_name]
+         # Pass the get_model function to the agent factory
+         kwargs['get_model_func'] = get_model
+         return factory(agent_name=agent_name, **kwargs)
+
+     def get_supported_operations(self) -> Dict[str, list]:
+         """Get all supported operations from all agents"""
+         operations = {}
+         for agent_name, agent_info in self.agents.items():
+             supported_ops = agent_info.get('supported_operations', [])
+             for op in supported_ops:
+                 if op not in operations:
+                     operations[op] = []
+                 operations[op].append(agent_name)
+         return operations
+
+     def get_agents_for_operation(self, operation: str) -> list:
+         """Get all agents that support a specific operation"""
+         supported_ops = self.get_supported_operations()
+         return supported_ops.get(operation, [])
+
+
+ # Global agent manager instance
+ agent_manager = AgentManager()
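A short discovery sketch against the global manager (assumes at least one plugin is installed and that `validate_api_keys()` passes in your environment):

```python
from lf_algorithm.agent_manager import agent_manager

# Enumerate whatever plugins the entry-point scan found.
for name, info in agent_manager.list_agents().items():
    print(name, "->", info.get("description", "no description"))

# Reverse index: which agents support a given operation?
print(agent_manager.get_agents_for_operation("lineage_analysis"))
```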
lf_algorithm/framework_agent.py ADDED
@@ -0,0 +1,130 @@
+ import asyncio
+ import sys
+ import os
+ from typing import Dict, Any, List, Optional, Union
+ import json
+ from datetime import datetime
+ import uuid
+
+ from .utils import get_logger, get_model, validate_api_keys
+
+ logger = get_logger(__name__)
+
+ from .utils.tracers import LogTracer
+ from .agent_manager import agent_manager
+ from agents import add_trace_processor
+ from .models.models import AgentResult
+
+
+ class FrameworkAgent:
+
+     def __init__(self, agent_name: str, model_name: str = "gpt-4o-mini",
+                  source_code: str = None):
+         """
+         Initialize the Agent Framework.
+
+         Args:
+             agent_name (str): The name of the agent to use
+             model_name (str): The model to use for the agents (default: "gpt-4o-mini")
+             source_code (str): The source code to analyze
+
+         Raises:
+             ValueError: If source_code is not provided
+         """
+         if not source_code:
+             raise ValueError("source_code is required and cannot be None")
+
+         self.agent_name = agent_name
+         self.model_name = model_name
+         self.source_code = source_code
+         self.agent_manager = agent_manager
+
+         # Validate API keys on initialization
+         validate_api_keys()
+
+         logger.info(f"FrameworkAgent initialized: agent_name={agent_name}, model_name={model_name}")
+
+     async def run_agent_plugin(self, **kwargs) -> Dict[str, Any]:
+         """
+         Run a specific agent on the configured source code.
+
+         Args:
+             **kwargs: Additional arguments to pass to the agent
+
+         Returns:
+             Dict[str, Any]: The results from the agent with merged OpenLineage metadata
+         """
+         logger.info(f"Starting agent: {self.agent_name} with model: {self.model_name}")
+         add_trace_processor(LogTracer())
+
+         try:
+             # Create the agent using the plugin's factory function
+             logger.info(f"Creating agent instance for: {self.agent_name}")
+             agent = self.agent_manager.create_agent(
+                 agent_name=self.agent_name,
+                 source_code=self.source_code,
+                 model_name=self.model_name,
+                 **kwargs
+             )
+
+             # Run the agent
+             logger.info(f"Running agent: {self.agent_name}")
+             results = await agent.run()
+             logger.info(f"Agent {self.agent_name} completed successfully")
+
+             return results
+
+         except Exception as e:
+             logger.error(f"Error running agent {self.agent_name}: {e}")
+             return {"error": str(e)}
+
+     def map_results_to_objects(self, results: Dict[str, Any]) -> Union[AgentResult, Dict[str, Any]]:
+         """
+         Map JSON results from agent to structured AgentResult objects.
+
+         Args:
+             results: Dictionary containing the agent results
+
+         Returns:
+             AgentResult: Structured object representation of the results, or original dict if mapping fails
+         """
+         try:
+             # Check if it's an error response
+             if "error" in results:
+                 return results
+
+             # Check if it has the expected structure for lineage results
+             if "inputs" in results and "outputs" in results:
+                 return AgentResult.from_dict(results)
+
+             # If it doesn't match the expected structure, return as-is
+             return results
+
+         except Exception as e:
+             logger.error(f"Error mapping results to objects: {e}")
+             return results
+
+     async def run_agent(self, **kwargs) -> Union[AgentResult, Dict[str, Any]]:
+         """
+         Run a specific agent and return structured objects instead of raw dictionaries.
+
+         Args:
+             **kwargs: Additional arguments to pass to the agent
+
+         Returns:
+             Union[AgentResult, Dict[str, Any]]: Structured AgentResult object or error dict
+         """
+         logger.info(f"Starting run_agent for {self.agent_name}")
+         raw_results = await self.run_agent_plugin(**kwargs)
+         mapped_results = self.map_results_to_objects(raw_results)
+         logger.info(f"Agent {self.agent_name} completed. Results type: {type(mapped_results)}")
+         if hasattr(mapped_results, 'to_dict'):
+             logger.info(f"Mapped results: {mapped_results.to_dict()}")
+         else:
+             logger.info(f"Raw results: {mapped_results}")
+         return mapped_results
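The mapping step can be exercised without any LLM call, since it only inspects dict shape; a sketch (again assuming API-key validation in `__init__` passes in your environment):

```python
from lf_algorithm import AgentResult, FrameworkAgent

fa = FrameworkAgent(agent_name="sql-lineage-agent", source_code="SELECT 1")

# A dict with "inputs"/"outputs" maps to a structured AgentResult...
payload = {"inputs": [{"namespace": "default", "name": "users"}], "outputs": []}
assert isinstance(fa.map_results_to_objects(payload), AgentResult)

# ...while error dicts and unrecognized shapes pass through unchanged.
assert fa.map_results_to_objects({"error": "boom"}) == {"error": "boom"}
```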
lf_algorithm/models/__pycache__/models.cpython-313.pyc ADDED
Binary file (16.1 kB).
lf_algorithm/models/models.py ADDED
@@ -0,0 +1,285 @@
+ """
+ Agent result models for mapping JSON responses from lineage agents.
+
+ This module contains classes for representing the structured results returned
+ by lineage analysis agents in a type-safe manner.
+ """
+
+ from typing import Dict, Any, List, Optional
+
+
+ class SchemaField:
+     """Schema field configuration for agent results"""
+
+     def __init__(self, name: str, type: str, description: str):
+         self.name = name
+         self.type = type
+         self.description = description
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'SchemaField':
+         """Create SchemaField from dictionary"""
+         return cls(
+             name=data.get('name', ''),
+             type=data.get('type', ''),
+             description=data.get('description', '')
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'name': self.name,
+             'type': self.type,
+             'description': self.description
+         }
+
+
+ class Schema:
+     """Schema configuration for agent results"""
+
+     def __init__(self, fields: List[SchemaField]):
+         self.fields = fields
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Schema':
+         """Create Schema from dictionary"""
+         fields = [SchemaField.from_dict(field) for field in data.get('fields', [])]
+         return cls(fields=fields)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'fields': [field.to_dict() for field in self.fields]
+         }
+
+
+ class Transformation:
+     """Transformation configuration for column lineage"""
+
+     def __init__(self, type: str, subtype: str, description: str, masking: bool = False):
+         self.type = type
+         self.subtype = subtype
+         self.description = description
+         self.masking = masking
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Transformation':
+         """Create Transformation from dictionary"""
+         return cls(
+             type=data.get('type', ''),
+             subtype=data.get('subtype', ''),
+             description=data.get('description', ''),
+             masking=data.get('masking', False)
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'type': self.type,
+             'subtype': self.subtype,
+             'description': self.description,
+             'masking': self.masking
+         }
+
+
+ class InputField:
+     """Input field configuration for column lineage"""
+
+     def __init__(self, namespace: str, name: str, field: str,
+                  transformations: List[Transformation]):
+         self.namespace = namespace
+         self.name = name
+         self.field = field
+         self.transformations = transformations
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'InputField':
+         """Create InputField from dictionary"""
+         transformations = [Transformation.from_dict(t) for t in data.get('transformations', [])]
+         return cls(
+             namespace=data.get('namespace', ''),
+             name=data.get('name', ''),
+             field=data.get('field', ''),
+             transformations=transformations
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'namespace': self.namespace,
+             'name': self.name,
+             'field': self.field,
+             'transformations': [t.to_dict() for t in self.transformations]
+         }
+
+
+ class ColumnLineageField:
+     """Column lineage field configuration"""
+
+     def __init__(self, input_fields: List[InputField]):
+         self.input_fields = input_fields
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'ColumnLineageField':
+         """Create ColumnLineageField from dictionary"""
+         input_fields = [InputField.from_dict(field) for field in data.get('inputFields', [])]
+         return cls(input_fields=input_fields)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'inputFields': [field.to_dict() for field in self.input_fields]
+         }
+
+
+ class ColumnLineage:
+     """Column lineage configuration"""
+
+     def __init__(self, fields: Dict[str, ColumnLineageField]):
+         self.fields = fields
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'ColumnLineage':
+         """Create ColumnLineage from dictionary"""
+         fields = {
+             field_name: ColumnLineageField.from_dict(field_data)
+             for field_name, field_data in data.get('fields', {}).items()
+         }
+         return cls(fields=fields)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'fields': {
+                 field_name: field_data.to_dict()
+                 for field_name, field_data in self.fields.items()
+             }
+         }
+
+
+ class InputFacets:
+     """Input facets configuration for agent results"""
+
+     def __init__(self, schema: Optional[Schema] = None):
+         self.schema = schema
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'InputFacets':
+         """Create InputFacets from dictionary"""
+         schema = Schema.from_dict(data.get('schema', {})) if data.get('schema') else None
+         return cls(schema=schema)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         result = {}
+         if self.schema:
+             result['schema'] = self.schema.to_dict()
+         return result
+
+
+ class Input:
+     """Input configuration for agent results"""
+
+     def __init__(self, namespace: str, name: str, facets: Optional[InputFacets] = None):
+         self.namespace = namespace
+         self.name = name
+         self.facets = facets
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Input':
+         """Create Input from dictionary"""
+         facets = InputFacets.from_dict(data.get('facets', {})) if data.get('facets') else None
+         return cls(
+             namespace=data.get('namespace', ''),
+             name=data.get('name', ''),
+             facets=facets
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         result = {
+             'namespace': self.namespace,
+             'name': self.name
+         }
+         if self.facets:
+             result['facets'] = self.facets.to_dict()
+         return result
+
+
+ class OutputFacets:
+     """Output facets configuration for agent results"""
+
+     def __init__(self, column_lineage: Optional[ColumnLineage] = None):
+         self.column_lineage = column_lineage
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'OutputFacets':
+         """Create OutputFacets from dictionary"""
+         column_lineage = ColumnLineage.from_dict(data.get('columnLineage', {})) if data.get('columnLineage') else None
+         return cls(column_lineage=column_lineage)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         result = {}
+         if self.column_lineage:
+             result['columnLineage'] = self.column_lineage.to_dict()
+         return result
+
+
+ class Output:
+     """Output configuration for agent results"""
+
+     def __init__(self, namespace: str, name: str, facets: Optional[OutputFacets] = None):
+         self.namespace = namespace
+         self.name = name
+         self.facets = facets
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Output':
+         """Create Output from dictionary"""
+         facets = OutputFacets.from_dict(data.get('facets', {})) if data.get('facets') else None
+         return cls(
+             namespace=data.get('namespace', ''),
+             name=data.get('name', ''),
+             facets=facets
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         result = {
+             'namespace': self.namespace,
+             'name': self.name
+         }
+         if self.facets:
+             result['facets'] = self.facets.to_dict()
+         return result
+
+
+ class AgentResult:
+     """Main result class for agent lineage analysis"""
+
+     def __init__(self, inputs: List[Input], outputs: List[Output]):
+         self.inputs = inputs
+         self.outputs = outputs
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'AgentResult':
+         """Create AgentResult from dictionary"""
+         inputs = [Input.from_dict(input_data) for input_data in data.get('inputs', [])]
+         outputs = [Output.from_dict(output_data) for output_data in data.get('outputs', [])]
+         return cls(inputs=inputs, outputs=outputs)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'inputs': [input_obj.to_dict() for input_obj in self.inputs],
+             'outputs': [output_obj.to_dict() for output_obj in self.outputs]
+         }
+
+     def __str__(self) -> str:
+         """String representation"""
+         return f"AgentResult(inputs={len(self.inputs)}, outputs={len(self.outputs)})"
+
+     def __repr__(self) -> str:
+         """Detailed string representation"""
+         return f"AgentResult(inputs={self.inputs}, outputs={self.outputs})"
lf_algorithm/plugins/__init__.py ADDED
@@ -0,0 +1 @@
+ # Plugin system for FrameworkAgent
lf_algorithm/plugins/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (187 Bytes).
lf_algorithm/plugins/airflow_lineage_agent/__init__.py ADDED
@@ -0,0 +1 @@
+
lf_algorithm/plugins/airflow_lineage_agent/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (209 Bytes).
lf_algorithm/plugins/airflow_lineage_agent/__pycache__/airflow_instructions.cpython-313.pyc ADDED
Binary file (5.21 kB).
lf_algorithm/plugins/airflow_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc ADDED
Binary file (5.95 kB).
lf_algorithm/plugins/airflow_lineage_agent/airflow_instructions.py ADDED
@@ -0,0 +1,98 @@
+ def comprehensive_analysis_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Perform complete Airflow DAG lineage analysis in a single comprehensive process.
+
+ **Complete Analysis Process:**
+
+ **Step 1: Syntax Analysis**
+ 1. Call the airflow_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Airflow DAG structure
+ 3. Store the syntax analysis results for use in subsequent steps
+
+ **Step 2: Field Derivation**
+ 1. Call the airflow_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis
+ 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations
+ 4. Store the field derivation results
+
+ **Step 3: Operation Tracing**
+ 1. Call the airflow_lineage_operation_tracing() MCP tool to get expert instructions
+ 2. Use the syntax analysis results from Step 1 to inform your operation analysis
+ 3. Follow the MCP tool instructions exactly to analyze logical operations and operators
+ 4. Store the operation tracing results
+
+ **Step 4: Event Composition**
+ 1. Call the airflow_lineage_event_composer() MCP tool to get expert instructions
+ 2. Combine all previous analysis results (syntax, field derivation, operation tracing)
+ 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event
+ 4. Return the complete OpenLineage event
+
+ **Important Guidelines:**
+ - Each MCP tool contains detailed instructions, examples, and output format requirements
+ - Follow the MCP tool instructions precisely for each step
+ - Maintain context between steps - use results from earlier steps to inform later analysis
+ - Ensure the final output is a complete, properly formatted OpenLineage event
+ - If any step fails, provide clear error information and stop the process
+
+ **Workflow Summary:**
+ Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output
+ """
+
+ # Keep the individual instructions for backward compatibility if needed
+ def syntax_analysis_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Analyze the provided Airflow DAG for syntax structure.
+
+ **Process:**
+ 1. Call the airflow_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Airflow DAG
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def field_derivation_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Analyze field mappings and transformations in the Airflow DAG.
+
+ **Process:**
+ 1. Call the airflow_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze field mappings
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def operation_tracing_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Analyze logical operations and operators in the Airflow DAG.
+
+ **Process:**
+ 1. Call the airflow_lineage_operation_tracing() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze logical operations
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def event_composer_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Compose OpenLineage events from the provided analysis data.
+
+ **Process:**
+ 1. Call the airflow_lineage_event_composer() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to compose the OpenLineage event
+ 3. Return the event in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
lf_algorithm/plugins/airflow_lineage_agent/lineage_agent.py ADDED
@@ -0,0 +1,98 @@
+ import os
+ import sys
+ import logging
+ from contextlib import AsyncExitStack
+ from agents import Agent, Tool, Runner, trace
+ from agents.mcp.server import MCPServerStdio
+ from typing import Dict, Any, Optional
+
+ from ...utils.tracers import log_trace_id
+ from ...plugins.airflow_lineage_agent.airflow_instructions import comprehensive_analysis_instructions
+ from ...plugins.airflow_lineage_agent.mcp_servers.mcp_params import airflow_mcp_server_params
+ from ...utils.file_utils import dump_json_record
+
+ # Get logger for this module
+ logger = logging.getLogger(__name__)
+
+ MAX_TURNS = 30  # Increased for comprehensive analysis
+
+
+ class AirflowLineageAgent:
+     """Plugin agent for Airflow lineage analysis"""
+
+     def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None):
+         self.agent_name = agent_name
+         self.model_name = model_name
+         self.source_code = source_code
+         self.get_model_func = get_model_func
+
+     async def create_agent(self, airflow_mcp_servers) -> Agent:
+         # Use the passed get_model_func or fall back to the centralized one
+         if self.get_model_func:
+             model = self.get_model_func(self.model_name)
+         else:
+             from ...utils import get_model
+             model = get_model(self.model_name)
+
+         agent = Agent(
+             name=self.agent_name,
+             instructions=comprehensive_analysis_instructions(self.agent_name),
+             model=model,
+             mcp_servers=airflow_mcp_servers,
+         )
+         return agent
+
+     async def run_agent(self, airflow_mcp_servers, source_code: str):
+         # Create a single agent for comprehensive analysis
+         comprehensive_agent = await self.create_agent(airflow_mcp_servers)
+
+         # Run the complete analysis in one go
+         result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS)
+
+         # Return the final output
+         return dump_json_record(self.agent_name, result.final_output)
+
+     async def run_with_mcp_servers(self, source_code: str):
+         async with AsyncExitStack() as stack:
+             airflow_mcp_servers = [
+                 await stack.enter_async_context(
+                     MCPServerStdio(params, client_session_timeout_seconds=120)
+                 )
+                 for params in airflow_mcp_server_params
+             ]
+             return await self.run_agent(airflow_mcp_servers, source_code=source_code)
+
+     async def run_with_trace(self, source_code: str):
+         trace_name = f"{self.agent_name}-lineage-agent"
+         trace_id = log_trace_id(f"{self.agent_name.lower()}")
+         with trace(trace_name, trace_id=trace_id):
+             return await self.run_with_mcp_servers(source_code=source_code)
+
+     async def run(self):
+         try:
+             logger.info(f"Starting Airflow lineage analysis for {self.agent_name}")
+             result = await self.run_with_trace(self.source_code)
+             logger.info(f"Completed Airflow lineage analysis for {self.agent_name}")
+             return result
+         except Exception as e:
+             logger.error(f"Error running {self.agent_name}: {e}")
+             return {"error": str(e)}
+
+
+ # Plugin interface functions
+ def create_airflow_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> AirflowLineageAgent:
+     """Factory function to create an AirflowLineageAgent instance"""
+     return AirflowLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func)
+
+
+ def get_plugin_info() -> Dict[str, Any]:
+     """Return plugin metadata"""
+     return {
+         "name": "airflow-lineage-agent",
+         "description": "Airflow lineage analysis agent for parsing and analyzing Airflow DAGs",
+         "version": "1.0.0",
+         "author": "Ali Shamsaddinlou",
+         "agent_class": AirflowLineageAgent,
+         "factory_function": create_airflow_lineage_agent,
+         "supported_operations": ["lineage_analysis"],
+     }
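A direct-usage sketch of this plugin, bypassing FrameworkAgent. It assumes the relative MCP server path in mcp_params resolves from your working directory, that an LLM API key is configured, and a hypothetical DAG file path:

```python
import asyncio

from lf_algorithm.plugins.airflow_lineage_agent.lineage_agent import (
    create_airflow_lineage_agent,
)

with open("dags/customer_etl.py") as f:  # hypothetical DAG file
    dag_source = f.read()

agent = create_airflow_lineage_agent(
    agent_name="airflow-lineage-agent",
    source_code=dag_source,
)
print(asyncio.run(agent.run()))  # OpenLineage event, or {"error": ...}
```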
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__init__.py ADDED
File without changes
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (221 Bytes).
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc ADDED
Binary file (515 Bytes).
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/__init__.py ADDED
File without changes
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py ADDED
@@ -0,0 +1,55 @@
+ import logging
+
+ # Configure logging to suppress verbose output
+ logging.basicConfig(level=logging.WARNING)
+ logging.getLogger('mcp').setLevel(logging.WARNING)
+ logging.getLogger('mcp.server').setLevel(logging.WARNING)
+
+ from mcp.server.fastmcp import FastMCP
+ from typing import Dict, Any
+
+ mcp = FastMCP("lineage_airflow_server")
+
+ from templates import (airflow_lineage_syntax_analysis as syntax_analysis_template,
+                        airflow_lineage_field_derivation as field_derivation_template,
+                        airflow_lineage_operation_tracing as operation_tracing_template,
+                        airflow_lineage_event_composer as event_composer_template)
+
+ @mcp.tool()
+ async def airflow_lineage_syntax_analysis() -> Dict[str, Any]:
+     """Airflow lineage structure and syntax decomposition expert"""
+     return {
+         "instructions": syntax_analysis_template(),
+         "version": "1.0.0",
+         "capabilities": ["dag_parsing", "task_extraction", "dependency_analysis"]
+     }
+
+ @mcp.tool()
+ async def airflow_lineage_field_derivation() -> Dict[str, Any]:
+     """Field mapping and field derivation expert"""
+     return {
+         "instructions": field_derivation_template(),
+         "version": "1.0.0",
+         "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"]
+     }
+
+ @mcp.tool()
+ async def airflow_lineage_operation_tracing() -> Dict[str, Any]:
+     """Logical operator analysis and operation tracing expert"""
+     return {
+         "instructions": operation_tracing_template(),
+         "version": "1.0.0",
+         "capabilities": ["filter_analysis", "join_analysis", "aggregation_tracking"]
+     }
+
+ @mcp.tool()
+ async def airflow_lineage_event_composer() -> Dict[str, Any]:
+     """Event composition and aggregation expert"""
+     return {
+         "instructions": event_composer_template(),
+         "version": "1.0.0",
+         "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"]
+     }
+
+ if __name__ == "__main__":
+     mcp.run(transport='stdio')
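Since each tool just wraps a template builder, the instruction payloads can be smoke-tested without an MCP client. A sketch, run from this server's directory so the local `templates` import resolves:

```python
# Checks the raw instruction text behind the MCP tools above.
from templates import airflow_lineage_syntax_analysis

text = airflow_lineage_syntax_analysis()
assert "Airflow DAG decomposition expert" in text
print(text[:120])
```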
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/templates.py ADDED
@@ -0,0 +1,777 @@
+ from datetime import datetime
+
+
+ def airflow_lineage_syntax_analysis():
+     return """
+ You are an Airflow DAG decomposition expert. Your task is to parse an Airflow DAG Python file and extract a clean breakdown of each task as logical units, including key operators, dependencies, and parameters.
+
+ Instructions:
+ - Extract complete Airflow tasks (not individual lines).
+ - Include task_id, operator name, and any important arguments (e.g., sql, bash_command, python_callable).
+ - Identify upstream/downstream task relationships.
+ - Do NOT include imports, default_args, or DAG definitions unless they affect task behavior directly.
+ - For TaskGroups or dynamic mapping, expand each logical unit clearly.
+
+ Output Format (JSON):
+ {
+   "tasks": [
+     {
+       "task_id": "<task_id>",
+       "operator": "<OperatorName>",
+       "params": {
+         "key1": "value1",
+         ...
+       },
+       "upstream": ["<task_id_1>", "<task_id_2>"],
+       "downstream": ["<task_id_3>"]
+     },
+     ...
+   ]
+ }
+
+ ---
+
+ Positive Example 1: Basic Bash DAG
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.bash import BashOperator
+
+ with DAG('sample_dag') as dag:
+     t1 = BashOperator(task_id='start', bash_command='echo "start"')
+     t2 = BashOperator(task_id='process', bash_command='python run_job.py')
+     t3 = BashOperator(task_id='end', bash_command='echo "done"')
+     t1 >> t2 >> t3
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "start",
+       "operator": "BashOperator",
+       "params": { "bash_command": "echo \"start\"" },
+       "upstream": [],
+       "downstream": ["process"]
+     },
+     {
+       "task_id": "process",
+       "operator": "BashOperator",
+       "params": { "bash_command": "python run_job.py" },
+       "upstream": ["start"],
+       "downstream": ["end"]
+     },
+     {
+       "task_id": "end",
+       "operator": "BashOperator",
+       "params": { "bash_command": "echo \"done\"" },
+       "upstream": ["process"],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 2: PythonOperator DAG
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+
+ def fetch_data():
+     return "data"
+
+ def transform_data():
+     return "transformed"
+
+ with DAG('etl_dag') as dag:
+     extract = PythonOperator(task_id='extract', python_callable=fetch_data)
+     transform = PythonOperator(task_id='transform', python_callable=transform_data)
+     extract >> transform
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "extract",
+       "operator": "PythonOperator",
+       "params": { "python_callable": "fetch_data" },
+       "upstream": [],
+       "downstream": ["transform"]
+     },
+     {
+       "task_id": "transform",
+       "operator": "PythonOperator",
+       "params": { "python_callable": "transform_data" },
+       "upstream": ["extract"],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 3: Branching with BranchPythonOperator
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator, BranchPythonOperator
+ from airflow.operators.dummy import DummyOperator
+
+ def choose_path():
+     return "path_a"
+
+ with DAG('branch_dag') as dag:
+     start = DummyOperator(task_id='start')
+     branch = BranchPythonOperator(task_id='branch', python_callable=choose_path)
+     path_a = DummyOperator(task_id='path_a')
+     path_b = DummyOperator(task_id='path_b')
+     end = DummyOperator(task_id='end')
+
+     start >> branch >> [path_a, path_b]
+     [path_a, path_b] >> end
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "start",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": [],
+       "downstream": ["branch"]
+     },
+     {
+       "task_id": "branch",
+       "operator": "BranchPythonOperator",
+       "params": { "python_callable": "choose_path" },
+       "upstream": ["start"],
+       "downstream": ["path_a", "path_b"]
+     },
+     {
+       "task_id": "path_a",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["branch"],
+       "downstream": ["end"]
+     },
+     {
+       "task_id": "path_b",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["branch"],
+       "downstream": ["end"]
+     },
+     {
+       "task_id": "end",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["path_a", "path_b"],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 4: TaskGroup
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.dummy import DummyOperator
+ from airflow.utils.task_group import TaskGroup
+
+ with DAG('grouped_dag') as dag:
+     start = DummyOperator(task_id='start')
+     end = DummyOperator(task_id='end')
+
+     with TaskGroup('transformations') as tg:
+         t1 = DummyOperator(task_id='clean')
+         t2 = DummyOperator(task_id='enrich')
+         t1 >> t2
+
+     start >> tg >> end
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "start",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": [],
+       "downstream": ["transformations.clean"]
+     },
+     {
+       "task_id": "transformations.clean",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["start"],
+       "downstream": ["transformations.enrich"]
+     },
+     {
+       "task_id": "transformations.enrich",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["transformations.clean"],
+       "downstream": ["end"]
+     },
+     {
+       "task_id": "end",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["transformations.enrich"],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 5: Dynamic Task Mapping with expand()
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+
+ def greet(name):
+     print(f"Hello {name}")
+
+ with DAG('dynamic_dag') as dag:
+     greet_task = PythonOperator.partial(
+         task_id='greet',
+         python_callable=greet
+     ).expand(op_args=[["Alice", "Bob", "Charlie"]])
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "greet",
+       "operator": "PythonOperator.expand",
+       "params": {
+         "python_callable": "greet",
+         "op_args": ["Alice", "Bob", "Charlie"]
+       },
+       "upstream": [],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Negative Example 1:
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+
+ def fetch():
+     return "data"
+
+ with DAG('bad_dag') as dag:
+     task = PythonOperator(task_id='fetch', python_callable=fetch)
+
+ Incorrect Output:
+ {
+   "fetch": "PythonOperator"
+ }
+
+ Reason:
+ - The structure is invalid:
+   - It lacks the required `"tasks"` array.
+   - It omits the `"params"` block.
+   - It does not specify upstream/downstream relationships.
+ """
+
+
+ def airflow_lineage_field_derivation():
+     return """
+ You are an Airflow task field mapping analysis expert. Your task is to analyze each task in an Airflow DAG and determine:
+
+ 1. What input data or fields it depends on.
+ 2. What transformations it performs.
+ 3. What output data or fields it produces.
+
+ Instructions:
+ - Focus on operators like BashOperator, PythonOperator, SQL-related operators, etc.
+ - Do NOT analyze Airflow scheduling logic or metadata unless it affects lineage.
+ - For PythonOperators, infer logic from the function if possible.
+ - For SQL or BashOperators, parse the SQL or script if included.
+ - Your job is to extract lineage-relevant inputs, transformations, and outputs.
+ - Look into all the operators and their parameters, and infer the inputs, outputs, and transformations.
+ - If the operator is a PythonOperator, look into the function and infer the inputs, outputs, and transformations.
+ - If the operator is an SQLOperator, look into the SQL and infer the inputs, outputs, and transformations.
+ - If the operator is a BashOperator, look into the Bash command and infer the inputs, outputs, and transformations.
+ - If the operator is a PostgresOperator, look into the SQL and infer the inputs, outputs, and transformations.
+ - If the operator is a MySQLOperator, look into the SQL and infer the inputs, outputs, and transformations.
+ - If the operator is an OracleOperator, look into the SQL and infer the inputs, outputs, and transformations.
+ - If the operator is a SparkOperator, look into the Spark code and infer the inputs, outputs, and transformations.
+ - If the operator is a HiveOperator, look into the Hive code and infer the inputs, outputs, and transformations.
+ - If the operator is a KafkaOperator, look into the Kafka code and infer the inputs, outputs, and transformations.
+ - If the operator is an S3Operator, look into the S3 code and infer the inputs, outputs, and transformations.
+ - If the operator is a GCSOperator, look into the GCS code and infer the inputs, outputs, and transformations.
+ - If the operator is an FTPOperator, look into the FTP code and infer the inputs, outputs, and transformations.
+ - If the operator is an SFTPOperator, look into the SFTP code and infer the inputs, outputs, and transformations.
+
+ Output Format:
+ [
+   { "output_fields": [ {
+       "namespace": "<INPUT_NAMESPACE>",
+       "name": "<INPUT_NAME>",
+       "field": "<INPUT_FIELD_NAME>",
+       "transformation": "<description of logic>"
+   } ] },
+   ...
+ ]
+
+ Positive Example:
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+ from airflow.providers.postgres.hooks.postgres import PostgresHook
+ from datetime import datetime
+ import pandas as pd
+ import numpy as np
+ import shutil
+
+ def fetch_raw_data():
+     # Simulate a data pull or raw copy
+     shutil.copy('/data/source/raw_customers.csv', '/data/input/customers.csv')
+
+ def transform_customer_data():
+     df = pd.read_csv('/data/input/customers.csv')
+
+     df['first_name'] = df['first_name'].str.strip().str.title()
+     df['last_name'] = df['last_name'].str.strip().str.title()
+     df['full_name'] = df['first_name'] + ' ' + df['last_name']
+
+     df['birthdate'] = pd.to_datetime(df['birthdate'])
+     df['age'] = (pd.Timestamp('today') - df['birthdate']).dt.days // 365
+
+     df['age_group'] = np.where(df['age'] >= 60, 'Senior',
+                                np.where(df['age'] >= 30, 'Adult', 'Young'))
+
+     df = df[df['email'].notnull()]
+
+     df.to_csv('/data/output/cleaned_customers.csv', index=False)
+
+ def load_to_warehouse():
+     # Load cleaned data to customers_1 table in database
+     df = pd.read_csv('/data/output/cleaned_customers.csv')
+
+     # Get database connection
+     pg_hook = PostgresHook(postgres_conn_id='warehouse_connection')
+     engine = pg_hook.get_sqlalchemy_engine()
+
+     # Write to customers_1 table
+     df.to_sql('customers_1', engine, if_exists='replace', index=False)
+
+     print(f"Successfully loaded {len(df)} records to customers_1 table")
+
+ default_args = {
+     'start_date': datetime(2025, 8, 1),
+ }
+
+ with DAG(
+     dag_id='customer_etl_pipeline_extended',
+     default_args=default_args,
+     schedule_interval='@daily',
+     catchup=False,
+     tags=['etl', 'example']
+ ) as dag:
+
+     ff = PythonOperator(
+         task_id='fetch_data',
+         python_callable=fetch_raw_data
+     )
+
+     tt = PythonOperator(
+         task_id='transform_and_clean',
+         python_callable=transform_customer_data
+     )
+
+     ll = PythonOperator(
+         task_id='load_to_warehouse',
+         python_callable=load_to_warehouse
+     )
+
+     ff >> tt >> ll
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "first_name",
+       "transformation": "Strip and title case"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "last_name",
+       "transformation": "Strip and title case"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "full_name",
+       "transformation": "Concatenation with space"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "birthdate",
+       "transformation": "Convert to datetime"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "age",
+       "transformation": "Calculate age"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "age_group",
+       "transformation": "Group by age"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "email",
+       "transformation": "Remove nulls"
+     }
+   ]
+ }
+ """
+
+
+ def airflow_lineage_operation_tracing():
+     return """
+ You are a logical operator analysis expert for Airflow DAGs. Your task is to inspect each task’s logic and extract the logical operations applied to data fields. This includes:
+
+ - Filters
+ - Joins (if any SQL is embedded or implied)
+ - Group by / Having
+ - Order by
+ - Other conditional logic (e.g., CASE, EXISTS, .apply filters)
+
+ Instructions:
+ - Only include fields involved in logic, not all fields.
+ - Tasks using Python callables or SQL should be parsed and analyzed.
+ - Bash commands are only considered if they invoke Python/SQL/CLI logic that performs data filtering or selection.
+
+ Output Format:
+ {
+   "logical_operators": [
+     {
+       "task_id": "<task_id>",
+       "source_fields": ["<field1>", "<field2>", ...],
+       "logical_operators": {
+         "filters": ["..."],
+         "joins": ["..."],
+         "group_by": ["..."],
+         "having": ["..."],
+         "order_by": ["..."],
+         "other": ["..."]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 1:
+
+ Input:
+ from airflow.operators.postgres_operator import PostgresOperator
+
+ t1 = PostgresOperator(
+     task_id='filter_active_users',
+     sql='SELECT id, name FROM users WHERE status = \'active\' ORDER BY name',
+     postgres_conn_id='analytics_db'
+ )
+
+ Expected Output:
+ {
+   "logical_operators": [
+     {
+       "task_id": "filter_active_users",
+       "source_fields": ["status", "name"],
+       "logical_operators": {
+         "filters": ["status = 'active'"],
+         "order_by": ["name"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 2:
+
+ Input:
+ from airflow.operators.python import PythonOperator
+
+ def filter_sales():
+     import pandas as pd
+     df = pd.read_csv("sales.csv")
+     filtered = df[df["region"] == "EU"]
+     result = filtered[filtered["amount"] > 1000]
+     return result
+
+ t2 = PythonOperator(
+     task_id='filter_sales',
+     python_callable=filter_sales
+ )
+
+ Expected Output:
+ {
+   "logical_operators": [
+     {
+       "task_id": "filter_sales",
+       "source_fields": ["region", "amount"],
+       "logical_operators": {
+         "filters": ["df['region'] == 'EU'", "filtered['amount'] > 1000"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Negative Example 1:
+
+ Input:
+ from airflow.operators.bash import BashOperator
+
+ t3 = BashOperator(
+     task_id='run_model',
+     bash_command='python model.py'
+ )
+
+ Incorrect Output:
+ {
+   "logical_operators": [
+     {
+       "task_id": "run_model",
+       "source_fields": ["model"],
+       "logical_operators": {
+         "filters": ["--use-gpu"]
+       }
+     }
+   ]
+ }
+
+ Reason:
+ - BashOperator with a generic script path provides no visible logical operations on data.
+ - There is no SQL or Python code to analyze for filtering, joining, or grouping.
+ - No valid field-level logic can be inferred.
+ """
+
+
+ def airflow_lineage_event_composer():
+     return """
+ You are an OpenLineage lineage generation expert for Apache Airflow DAGs.
+
+ Your job is to take parsed DAG tasks, field mappings, and logical operations, and generate a **single OpenLineage event JSON** representing full lineage across the DAG.
+
+ ---
+
+ ### You will receive:
+
+ 1. **DAG Task Breakdown** (with dependencies, task_ids, operator type, params)
+
+ 2. **Field Mappings** per task:
+ [
+   {
+     "task_id": "<task_id>",
+     "inputs": [...],
+     "outputs": [...],
+     "transformations": [...]
+   }
+ ]
+
+ 3. **Logical Operators** per task:
+ [
+   {
+     "task_id": "<task_id>",
+     "source_fields": [...],
+     "logical_operators": {
+       "filters": [...],
+       "joins": [...],
+       "group_by": [...],
+       "having": [...],
+       "order_by": [...],
+       "other": [...]
+     }
+   }
+ ]
+
+ ---
+
+ ### Your Task:
+
+ Generate **one OpenLineage event JSON** that captures the full end-to-end data flow and transformations in the DAG.
+
+ Strictly follow the format below:
+
+ - Do NOT rename, flatten, or restructure any fields or keys.
+ - Output only the final OpenLineage JSON — no extra text, comments, or explanation.
+ - `inputs` should represent input **datasets**, not individual fields.
+ - Based on the following examples, generate <INPUT_NAMESPACE>, <INPUT_NAME>, <OUTPUT_NAMESPACE>, <OUTPUT_NAME> for Apache Airflow DAGs and tasks (file-based sources/targets, SQL-based operators, cloud storage operators, in-memory variables):
+
+ Airflow PythonOperator (reads local file)
+ def _read_file():
+     with open("/data/raw/customers.csv") as f:
+         return f.read()
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: file./data/raw/customers.csv
+
+ Airflow PythonOperator (writes local file)
+ def _write_file(data):
+     with open("/data/curated/customers_curated.csv", "w") as f:
+         f.write(data)
+ Expected:
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: file./data/curated/customers_curated.csv
+
+ Airflow BashOperator (reads S3 file)
+ bash_command="aws s3 cp s3://datalake/raw/events/2025-08-01.json -"
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: s3./datalake/raw/events/2025-08-01.json
+
+ Airflow BashOperator (writes S3 file)
+ bash_command="aws s3 cp /tmp/output.json s3://warehouse/gold/output.json"
+ Expected:
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: s3./warehouse/gold/output.json
+
+ Airflow SQL operators (PostgresOperator with schema.table)
+ sql="SELECT * FROM analytics.orders"
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: analytics.orders
+
+ Airflow SQL operators (BigQueryOperator with project.dataset.table)
+ sql="SELECT id FROM project123.dataset456.customers"
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: project123
+ <INPUT_NAME> or <OUTPUT_NAME>: dataset456.customers
+
+ Airflow S3ToRedshiftOperator
+ s3_bucket="datalake", s3_key="bronze/sales.csv", table="analytics.sales"
+ Expected:
+ <INPUT_NAMESPACE>: default
+ <INPUT_NAME>: s3./datalake/bronze/sales.csv
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: analytics.sales
+
+ Airflow LocalFilesystemToGCSOperator
+ src="/tmp/data.json", dst="bronze/data.json"
+ Expected:
+ <INPUT_NAMESPACE>: default
+ <INPUT_NAME>: file./tmp/data.json
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: gs./bronze/data.json
+
+ Airflow in-memory XCom variable
+ ti.xcom_push(key="intermediate_data", value=[1,2,3])
+ Expected:
+ <OUTPUT_NAMESPACE>: temp
+ <OUTPUT_NAME>: intermediate_data
+
+ Airflow XCom read
+ data = ti.xcom_pull(key="intermediate_data")
+ Expected:
+ <INPUT_NAMESPACE>: temp
+ <INPUT_NAME>: intermediate_data
+
+ Notes:
+ - Use scheme prefixes for path-like sources/targets:
+     file./absolute/or/relative/path
+     s3./bucket/key
+     gs./bucket/key
+     abfs./container/path
+ - For in-memory XComs or Python variables, use:
+     <NAMESPACE> = temp
+     <NAME> = <variable_or_key_name>
+ - For SQL-based operators:
+     BigQuery: namespace = <project>, name = <dataset.table>
+     Postgres/MySQL: namespace = default, name = <schema.table>
+     SQL Server: namespace = <database>, name = <schema.table>
+ - Wherever you can't find information for <STORAGE_LAYER>, <FILE_FORMAT>, <DATASET_TYPE>, <SUB_TYPE>, <LIFECYCLE>, <OWNER_NAME>, <OWNER_TYPE>, <SUBTYPE>, <DESCRIPTION>, write "NA".
+ - Very important: Your output must follow **exactly** the JSON structure specified below — do not output explanations, comments, or anything else.
+
+ ---
+
+ ### Required Output Format (Example):
+ {
+   "inputs": [
+     {
+       "namespace": "<INPUT_NAMESPACE>",
+       "name": "<INPUT_NAME>",
+       "facets": {
+         "schema": {
+           "fields": [
+             {
+               "name": "<FIELD_NAME>",
+               "type": "<FIELD_TYPE>",
+               "description": "<FIELD_DESCRIPTION>"
+             }
+           ]
+         }
+       }
+     }
+   ],
+   "outputs": [
+     {
+       "namespace": "<OUTPUT_NAMESPACE>",
+       "name": "<OUTPUT_NAME>",
+       "facets": {
+         "columnLineage": {
+           "fields": {
+             "<OUTPUT_FIELD_NAME>": {
+               "inputFields": [
+                 {
+                   "namespace": "<INPUT_NAMESPACE>",
+                   "name": "<INPUT_NAME>",
+                   "field": "<INPUT_FIELD_NAME>",
+                   "transformations": [
+                     {
+                       "type": "<TRANSFORMATION_TYPE>",
+                       "subtype": "<SUBTYPE>",
+                       "description": "<DESCRIPTION>",
+                       "masking": false
+                     }
+                   ]
+                 }
+               ]
+             }
+           }
+         }
+       }
+     }
+   ]
+ }
+
+ Return only results in the JSON schema format above. Do not add any text.
+ """
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_params.py ADDED
@@ -0,0 +1,9 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv(override=True)
+
+ # airflow_lineage_agent MCP server params
+ airflow_mcp_server_params = [
+     {"command": "python", "args": ["lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py"]},
+ ]
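The server path above is resolved against the process working directory, so launching from anywhere but the repo root would break it. A sketch of an absolute-path variant (an alternative, not what this commit does):

```python
import os

from dotenv import load_dotenv

load_dotenv(override=True)

# Resolve the server script relative to this file instead of the CWD.
_HERE = os.path.dirname(os.path.abspath(__file__))
_SERVER = os.path.join(_HERE, "mcp_airflow_lineage", "lineage_airflow_server.py")

airflow_mcp_server_params = [
    {"command": "python", "args": [_SERVER]},
]
```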
lf_algorithm/plugins/java_lineage_agent/__init__.py ADDED
@@ -0,0 +1 @@
+
lf_algorithm/plugins/java_lineage_agent/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (206 Bytes).
lf_algorithm/plugins/java_lineage_agent/__pycache__/java_instructions.cpython-313.pyc ADDED
Binary file (5.15 kB).
lf_algorithm/plugins/java_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc ADDED
Binary file (5.84 kB).
lf_algorithm/plugins/java_lineage_agent/java_instructions.py ADDED
@@ -0,0 +1,98 @@
+ def comprehensive_analysis_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Perform complete Java code lineage analysis in a single comprehensive process.
+
+ **Complete Analysis Process:**
+
+ **Step 1: Syntax Analysis**
+ 1. Call the java_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Java code structure
+ 3. Store the syntax analysis results for use in subsequent steps
+
+ **Step 2: Field Derivation**
+ 1. Call the java_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis
+ 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations
+ 4. Store the field derivation results
+
+ **Step 3: Operation Tracing**
+ 1. Call the java_lineage_operation_tracing() MCP tool to get expert instructions
+ 2. Use the syntax analysis results from Step 1 to inform your operation analysis
+ 3. Follow the MCP tool instructions exactly to analyze logical operations and operators
+ 4. Store the operation tracing results
+
+ **Step 4: Event Composition**
+ 1. Call the java_lineage_event_composer() MCP tool to get expert instructions
+ 2. Combine all previous analysis results (syntax, field derivation, operation tracing)
+ 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event
+ 4. Return the complete OpenLineage event
+
+ **Important Guidelines:**
+ - Each MCP tool contains detailed instructions, examples, and output format requirements
+ - Follow the MCP tool instructions precisely for each step
+ - Maintain context between steps - use results from earlier steps to inform later analysis
+ - Ensure the final output is a complete, properly formatted OpenLineage event
+ - If any step fails, provide clear error information and stop the process
+
+ **Workflow Summary:**
+ Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output
+ """
+
+ # Keep the individual instructions for backward compatibility if needed
+ def syntax_analysis_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Analyze the provided Java code for syntax structure.
+
+ **Process:**
+ 1. Call the java_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Java code
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def field_derivation_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Analyze field mappings and transformations in the Java code.
+
+ **Process:**
+ 1. Call the java_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze field mappings
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def operation_tracing_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Analyze logical operations and operators in the Java code.
+
+ **Process:**
+ 1. Call the java_lineage_operation_tracing() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze logical operations
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def event_composer_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Compose OpenLineage events from the provided analysis data.
+
+ **Process:**
+ 1. Call the java_lineage_event_composer() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to compose the OpenLineage event
+ 3. Return the event in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
lf_algorithm/plugins/java_lineage_agent/lineage_agent.py ADDED
@@ -0,0 +1,97 @@
+ import os
+ import sys
+ import logging
+ from contextlib import AsyncExitStack
+ from agents import Agent, Tool, Runner, trace
+ from agents.mcp.server import MCPServerStdio
+ from typing import Dict, Any, Optional
+
+ from ...utils.tracers import log_trace_id
+ from ...plugins.java_lineage_agent.java_instructions import comprehensive_analysis_instructions
+ from ...plugins.java_lineage_agent.mcp_servers.mcp_params import java_mcp_server_params
+ from ...utils.file_utils import dump_json_record
+
+ # Get logger for this module
+ logger = logging.getLogger(__name__)
+
+ MAX_TURNS = 30  # Increased for comprehensive analysis
+
+
+ class JavaLineageAgent:
+     """Plugin agent for Java lineage analysis"""
+
+     def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None):
+         self.agent_name = agent_name
+         self.model_name = model_name
+         self.source_code = source_code
+         self.get_model_func = get_model_func
+
+     async def create_agent(self, java_mcp_servers) -> Agent:
+         # Use the passed get_model_func or fall back to the centralized one
+         if self.get_model_func:
+             model = self.get_model_func(self.model_name)
+         else:
+             from ...utils import get_model
+             model = get_model(self.model_name)
+
+         agent = Agent(
+             name=self.agent_name,
+             instructions=comprehensive_analysis_instructions(self.agent_name),
+             model=model,
+             mcp_servers=java_mcp_servers,
+         )
+         return agent
+
+     async def run_agent(self, java_mcp_servers, source_code: str):
+         # Create a single agent for comprehensive analysis
+         comprehensive_agent = await self.create_agent(java_mcp_servers)
+
+         # Run the complete analysis in one go
+         result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS)
+
+         # Return the final output
+         return dump_json_record(self.agent_name, result.final_output)
+
+     async def run_with_mcp_servers(self, source_code: str):
+         async with AsyncExitStack() as stack:
+             java_mcp_servers = [
+                 await stack.enter_async_context(
+                     MCPServerStdio(params, client_session_timeout_seconds=120)
+                 )
+                 for params in java_mcp_server_params
+             ]
+             return await self.run_agent(java_mcp_servers, source_code=source_code)
+
+     async def run_with_trace(self, source_code: str):
+         trace_name = f"{self.agent_name}-lineage-agent"
+         trace_id = log_trace_id(f"{self.agent_name.lower()}")
+         with trace(trace_name, trace_id=trace_id):
+             return await self.run_with_mcp_servers(source_code=source_code)
+
+     async def run(self):
+         try:
+             logger.info(f"Starting Java lineage analysis for {self.agent_name}")
+             result = await self.run_with_trace(self.source_code)
+             logger.info(f"Completed Java lineage analysis for {self.agent_name}")
+             return result
+         except Exception as e:
+             logger.error(f"Error running {self.agent_name}: {e}")
+             return {"error": str(e)}
+
+
+ # Plugin interface functions
+ def create_java_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> JavaLineageAgent:
+     """Factory function to create a JavaLineageAgent instance"""
+     return JavaLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func)
+
+
+ def get_plugin_info() -> Dict[str, Any]:
+     """Return plugin metadata"""
+     return {
+         "name": "java-lineage-agent",
+         "description": "Java lineage analysis agent for parsing and analyzing Java code",
+         "version": "1.0.0",
+         "author": "Ali Shamsaddinlou",
+         "agent_class": JavaLineageAgent,
+         "factory_function": create_java_lineage_agent,
+     }
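A minimal end-to-end usage sketch for this plugin, assuming model credentials (e.g., OPENAI_API_KEY) are configured in the environment; the Java snippet passed in is illustrative:

import asyncio
from lf_algorithm.plugins.java_lineage_agent.lineage_agent import create_java_lineage_agent

java_code = 'List<String> lines = Files.readAllLines(Paths.get("sales.csv"));'

# Build the agent via the factory function, then run the full
# syntax -> field derivation -> operation tracing -> event composition pipeline.
agent = create_java_lineage_agent(agent_name="java-demo", source_code=java_code)
result = asyncio.run(agent.run())
print(result)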
lf_algorithm/plugins/java_lineage_agent/mcp_servers/__init__.py ADDED
File without changes
lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (218 Bytes).
 
lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc ADDED
Binary file (500 Bytes).
 
lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/__init__.py ADDED
File without changes
lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py ADDED
@@ -0,0 +1,55 @@
+ import logging
+
+ # Configure logging to suppress verbose output
+ logging.basicConfig(level=logging.WARNING)
+ logging.getLogger('mcp').setLevel(logging.WARNING)
+ logging.getLogger('mcp.server').setLevel(logging.WARNING)
+
+ from mcp.server.fastmcp import FastMCP
+ from typing import Dict, Any
+
+ mcp = FastMCP("lineage_java_server")
+
+ from templates import (java_lineage_syntax_analysis as syntax_analysis_template,
+                        java_lineage_field_derivation as field_derivation_template,
+                        java_lineage_operation_tracing as operation_tracing_template,
+                        java_lineage_event_composer as event_composer_template)
+
+ @mcp.tool()
+ async def java_lineage_syntax_analysis() -> Dict[str, Any]:
+     """Java lineage structure and syntax decomposition expert"""
+     return {
+         "instructions": syntax_analysis_template(),
+         "version": "1.0.0",
+         "capabilities": ["java_parsing", "method_extraction", "block_analysis"]
+     }
+
+ @mcp.tool()
+ async def java_lineage_field_derivation() -> Dict[str, Any]:
+     """Field mapping and field derivation expert"""
+     return {
+         "instructions": field_derivation_template(),
+         "version": "1.0.0",
+         "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"]
+     }
+
+ @mcp.tool()
+ async def java_lineage_operation_tracing() -> Dict[str, Any]:
+     """Logical operator analysis and operation tracing expert"""
+     return {
+         "instructions": operation_tracing_template(),
+         "version": "1.0.0",
+         "capabilities": ["filter_analysis", "stream_analysis", "aggregation_tracking"]
+     }
+
+ @mcp.tool()
+ async def java_lineage_event_composer() -> Dict[str, Any]:
+     """Event composition and aggregation expert"""
+     return {
+         "instructions": event_composer_template(),
+         "version": "1.0.0",
+         "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"]
+     }
+
+ if __name__ == "__main__":
+     mcp.run(transport='stdio')
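Because this server speaks MCP over stdio, its tools can also be exercised directly with the reference MCP client, independent of any agent. A sketch, assuming the mcp Python SDK is installed and the script is run from the repository root:

import asyncio
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

async def fetch_instructions():
    params = StdioServerParameters(
        command="python",
        args=["lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py"],
    )
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Each tool takes no arguments and returns instructions plus metadata
            return await session.call_tool("java_lineage_syntax_analysis", {})

print(asyncio.run(fetch_instructions()))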
lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/templates.py ADDED
@@ -0,0 +1,605 @@
+ from datetime import datetime
+
+
+ def java_lineage_syntax_analysis():
+     return """
+ You are a Java data pipeline decomposition expert. Your task is to analyze complex Java source files and extract discrete, logical transformation blocks. These include data source initialization, filtering, transformation, aggregation, feature derivation, and any computation logic. Each extracted block should be meaningful, self-contained, and independently interpretable.
+
+ Instructions:
+ - Extract: Complete transformation steps, including data source initialization, filtering, mapping, joining, grouping, calculating, or any pre/postprocessing blocks.
+ - Do NOT extract single lines unless they represent a standalone logical operation or setup (e.g., reading a file, defining a method, or a full map/filter chain).
+ - Group tightly related chained operations (e.g., Java Stream chains) into a single transformation unit.
+ - Preserve entire method definitions or reusable transformation blocks intact.
+ - Comment lines (// ...) can help guide naming but should not be extracted on their own.
+
+ Output Format (JSON):
+ {
+   "sp1": { "name": "<descriptive_name>", "code": "<valid_java_code_block>" },
+   "sp2": { "name": "<descriptive_name>", "code": "<valid_java_code_block>" },
+   ...
+ }
+
+ ---
+
+ Positive Example 1:
+
+ Input Java:
+ import java.nio.file.*;
+ import java.util.*;
+ import java.util.stream.*;
+
+ public class DataProcessor {
+     public static void main(String[] args) throws Exception {
+         // Load data
+         List<String> lines = Files.readAllLines(Paths.get("sales.csv"));
+
+         // Parse and clean data
+         List<Sale> sales = lines.stream()
+             .skip(1)
+             .map(Sale::fromCsv)
+             .filter(s -> s.getPrice() != null)
+             .collect(Collectors.toList());
+
+         // Compute revenue
+         for (Sale s : sales) {
+             s.setRevenue(s.getPrice() * s.getQuantity());
+         }
+
+         // Filter high revenue
+         List<Sale> highRevenue = sales.stream()
+             .filter(s -> s.getRevenue() > 1000)
+             .collect(Collectors.toList());
+     }
+ }
+
+ Expected Output:
+ {
+   "sp1": {
+     "name": "load_sales_data_from_csv",
+     "code": "List<String> lines = Files.readAllLines(Paths.get(\"sales.csv\"));"
+   },
+   "sp2": {
+     "name": "parse_and_clean_sales_data",
+     "code": "List<Sale> sales = lines.stream()\n .skip(1)\n .map(Sale::fromCsv)\n .filter(s -> s.getPrice() != null)\n .collect(Collectors.toList());"
+   },
+   "sp3": {
+     "name": "compute_revenue_per_sale",
+     "code": "for (Sale s : sales) {\n s.setRevenue(s.getPrice() * s.getQuantity());\n}"
+   },
+   "sp4": {
+     "name": "filter_high_revenue_sales",
+     "code": "List<Sale> highRevenue = sales.stream()\n .filter(s -> s.getRevenue() > 1000)\n .collect(Collectors.toList());"
+   }
+ }
+
+ ---
+
+ Positive Example 2 (with method definition):
+
+ Input Java:
+ public static List<Double> normalize(List<Double> values) {
+     double mean = values.stream().mapToDouble(v -> v).average().orElse(0.0);
+     double std = Math.sqrt(values.stream().mapToDouble(v -> Math.pow(v - mean, 2)).average().orElse(0.0));
+     return values.stream().map(v -> (v - mean) / std).collect(Collectors.toList());
+ }
+
+ // In main
+ List<Double> incomes = loadIncomeData(); // Assume loaded
+ List<Double> normalized = normalize(incomes);
+
+ Expected Output:
+ {
+   "sp1": {
+     "name": "define_normalize_method",
+     "code": "public static List<Double> normalize(List<Double> values) {\n double mean = values.stream().mapToDouble(v -> v).average().orElse(0.0);\n double std = Math.sqrt(values.stream().mapToDouble(v -> Math.pow(v - mean, 2)).average().orElse(0.0));\n return values.stream().map(v -> (v - mean) / std).collect(Collectors.toList());\n}"
+   },
+   "sp2": {
+     "name": "load_income_data",
+     "code": "List<Double> incomes = loadIncomeData();"
+   },
+   "sp3": {
+     "name": "normalize_income_values",
+     "code": "List<Double> normalized = normalize(incomes);"
+   }
+ }
+
+ ---
+
+ Negative Example (Too granular):
+
+ {
+   "sp1": { "name": "skip_header", "code": "lines.stream().skip(1)" },
+   "sp2": { "name": "filter_null_price", "code": ".filter(s -> s.getPrice() != null)" }
+ }
+
+ Reason: These operations are tightly chained and should be grouped into a cohesive transformation step.
+ """
+
+
+ def java_lineage_field_derivation():
+     return """
+ You are a Java field mapping analysis expert. Given a Java code snippet (typically part of a data transformation pipeline), your job is to extract and explain how each output field or variable is derived. For each, identify:
+
+ 1. The **source field(s)** or variables it depends on
+ 2. The **transformation logic** applied (e.g., arithmetic operation, aggregation, string manipulation, method call, etc.)
+
+ Output Format:
+ {
+   "output_fields": [
+     {
+       "namespace": "<INPUT_NAMESPACE>",
+       "name": "<INPUT_NAME>",
+       "field": "<INPUT_FIELD_NAME>",
+       "transformation": "<description of logic>"
+     },
+     ...
+   ]
+ }
+
+ ---
+
+ Positive Example 1:
+
+ Input Java:
+ read from table employee
+ Employee employee = new Employee();
+ employee.setAnnualSalary(employee.getMonthlySalary() * 12);
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "namespace": "default",
+       "name": "employee",
+       "field": "monthlySalary",
+       "transformation": "Multiplied by 12"
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 2:
+
+ Input Java:
+ user.setFullName(user.getFirstName().toUpperCase() + " " + user.getLastName());
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "namespace": "default",
+       "name": "user",
+       "field": "firstName",
+       "transformation": "Concatenation with space; UPPER applied to first name"
+     },
+     {
+       "namespace": "default",
+       "name": "user",
+       "field": "lastName",
+       "transformation": "Concatenation with space; UPPER applied to last name"
+     }
+   ]
+ }
+
+ ---
+
+ Negative Example 1 (Incorrect: Unstructured):
+
+ {
+   "annualSalary": "employee.getMonthlySalary() * 12"
+ }
+
+ Reason: This is a raw expression and doesn't explain the transformation clearly or follow the expected schema.
+
+ ---
+
+ Negative Example 2 (Incorrect: Missing logic):
+
+ Input Java:
+ invoice.setTax(invoice.getIncome() * 0.3);
+
+ Incorrect Output:
+ {
+   "output_fields": [
+     {
+       "name": "tax",
+       "source": "invoice.getIncome()",
+       "transformation": "Direct"
+     }
+   ]
+ }
+
+ Reason: Transformation logic must describe that it was "Multiplied by 0.3", not just "Direct".
+ """
+
+
+ def java_lineage_operation_tracing():
+     return """
+ You are a Java logical operator analysis expert. Your task is to analyze Java code (typically using Streams, custom filter logic, or data transformation libraries) and extract all **logical operations** applied to data structures such as lists, maps, or custom data models, including:
+
+ - WHERE-like filters (e.g., `.filter()`, `if` conditions inside loops)
+ - JOIN conditions (e.g., matching fields from two objects)
+ - GROUP BY and aggregation keys (e.g., `.collect(groupingBy(...))`)
+ - Filtering after grouping (e.g., filtering a grouped map)
+ - Sorting operations (e.g., `.sorted(Comparator.comparing(...))`)
+ - Any logical expressions affecting element selection (e.g., `.anyMatch()`, `Predicate`, custom boolean-returning lambdas)
+
+ Only list the fields involved in logical operations, not all fields.
+
+ Return the result in the following structured format:
+
+ {
+   "output_fields": [
+     {
+       "source_structure": "<list_or_collection_variable_name>",
+       "source_fields": ["<field_1>", "<field_2>", "..."],
+       "logical_operators": {
+         "filters": [],
+         "joins": [],
+         "group_by": [],
+         "having": [],
+         "order_by": [],
+         "other": []
+       }
+     }
+   ]
+ }
+
+ - Only include entries for logical operators if the list is non-empty.
+ - Represent conditions and expressions fully and clearly.
+ - Normalize filters and joins (e.g., `e.getAge() > 18`, `emp.getDeptId() == dept.getId()`)
+ - Include all source collections involved and only the fields used in logical operations.
+
+ ---
+
+ Positive Example 1:
+
+ Input Java:
+ List<Employee> filtered = employees.stream()
+     .filter(e -> e.getRegion().equals("US"))
+     .collect(Collectors.toList());
+
+ Map<String, Double> grouped = filtered.stream()
+     .collect(Collectors.groupingBy(Employee::getCustomerId, Collectors.summingDouble(Employee::getAmount)));
+
+ Map<String, Double> result = grouped.entrySet().stream()
+     .filter(entry -> entry.getValue() > 1000)
+     .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "source_structure": "employees",
+       "source_fields": ["region", "customerId", "amount"],
+       "logical_operators": {
+         "filters": ["e.getRegion().equals(\"US\")", "entry.getValue() > 1000"],
+         "group_by": ["Employee::getCustomerId"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 2:
+
+ Input Java:
+ List<Merged> merged = employees.stream()
+     .flatMap(emp -> departments.stream()
+         .filter(dept -> emp.getDeptId() == dept.getId())
+         .map(dept -> new Merged(emp, dept)))
+     .collect(Collectors.toList());
+
+ List<Merged> active = merged.stream()
+     .filter(m -> m.getStatus().equals("active"))
+     .sorted(Comparator.comparing(Merged::getName))
+     .collect(Collectors.toList());
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "source_structure": "employees",
+       "source_fields": ["deptId", "status", "name"],
+       "logical_operators": {
+         "joins": ["emp.getDeptId() == dept.getId()"],
+         "filters": ["m.getStatus().equals(\"active\")"],
+         "order_by": ["Merged::getName"]
+       }
+     },
+     {
+       "source_structure": "departments",
+       "source_fields": ["id"],
+       "logical_operators": {
+         "joins": ["emp.getDeptId() == dept.getId()"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 3:
+
+ Input Java:
+ List<Account> flagged = accounts.stream()
+     .peek(a -> a.setFlag(a.getStatus().equals("closed") ? 1 : 0))
+     .collect(Collectors.toList());
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "source_structure": "accounts",
+       "source_fields": ["status"],
+       "logical_operators": {
+         "other": ["a.getStatus().equals(\"closed\") ? 1 : 0"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Negative Example 1 (Incorrect formatting):
+
+ {
+   "filters": "e.getRegion().equals(\"US\")",
+   "group_by": "Employee::getCustomerId"
+ }
+
+ Reason: This structure is flat and omits `source_structure`, `source_fields`, and required nesting under `output_fields`.
+
+ ---
+
+ Negative Example 2 (Missing logical clause):
+
+ Input Java:
+ List<User> result = users.stream()
+     .filter(u -> u.getAge() > 18)
+     .sorted(Comparator.comparing(User::getSignupDate))
+     .collect(Collectors.toList());
+
+ Incorrect Output:
+ {
+   "output_fields": [
+     {
+       "source_structure": "users",
+       "source_fields": ["age"],
+       "logical_operators": {
+         "filters": ["u.getAge() > 18"]
+       }
+     }
+   ]
+ }
+
+ Reason: The `order_by` clause is missing. `signupDate` must be included in `source_fields` and in `order_by`.
+ """
+
+
+ def java_lineage_event_composer():
+     return """
+ You are an OpenLineage lineage generation expert.
+
+ Your job is to take the outputs from upstream Java data analysis agents and generate a **single, complete OpenLineage event JSON** representing end-to-end data lineage for the transformation pipeline.
+
+ ---
+
+ ### You will receive:
+
+ 1. **Parsed Code Blocks** representing key transformation steps:
+ {
+   "sp1": { "name": "load_data", "code": "<Java code block>" },
+   "sp2": { "name": "filter_data", "code": "<Java code block>" },
+   "sp3": { "name": "compute_result", "code": "<Java code block>" }
+ }
+
+ 2. **Field Mappings**: one per code block (same order), in this format:
+ [
+   {
+     "output_fields": [
+       {
+         "name": "<output_variable_or_field>",
+         "source": "<input_field(s) or variable(s)>",
+         "transformation": "<description of logic>"
+       }
+     ]
+   },
+   ...
+ ]
+
+ 3. **Logical Operators**: one per code block (same order), in this format:
+ [
+   {
+     "output_fields": [
+       {
+         "source_structure": "<collection_name_or_stream_variable>",
+         "source_fields": ["field1", "field2"],
+         "logical_operators": {
+           "filters": ["..."],
+           "joins": ["..."],
+           "group_by": ["..."],
+           "having": ["..."],
+           "order_by": ["..."],
+           "other": ["..."]
+         }
+       }
+     ]
+   },
+   ...
+ ]
+
+ ---
+
+ ### Your Task:
+
+ Generate **one event JSON** that captures the **entire pipeline** from raw source data to final derived outputs.
+
+ Strictly follow the structure below and do not change field names or nesting. It is **very important** to keep the exact same format:
+
+ - Use `"inputs"` and `"outputs"` as array keys (do NOT use `inputDataset` or `outputDataset`)
+ - Preserve `"facets"` blocks under `"job"`, `"inputs"`, and `"outputs"`
+ - Include `"columnLineage"` as a facet under `"outputs.facets"` (not at the top level)
+ - Maintain the exact field names:
+   - `"eventType"`, `"eventTime"`, `"run"`, `"job"`, `"inputs"`, `"outputs"`, `"facets"`, `"query"`, `"processingType"`, `"integration"`, etc.
+
+ 3. You should include all the fields mentioned in the following JSON schema.
+ 4. Based on the following examples, generate <INPUT_NAMESPACE>, <INPUT_NAME>, <OUTPUT_NAMESPACE>, <OUTPUT_NAME> for Java code patterns (pure Java I/O, JDBC, Hibernate/JPA):
+
+ Pure Java (read file via NIO)
+ List<String> lines = java.nio.file.Files.readAllLines(java.nio.file.Paths.get("/data/raw/customers.csv"));
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: file./data/raw/customers.csv
+
+ Pure Java (write file)
+ java.nio.file.Files.write(java.nio.file.Paths.get("/data/curated/sales_curated.csv"), bytes);
+ Expected:
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: file./data/curated/sales_curated.csv
+
+ In-memory collections/objects
+ List<Customer> customers = new ArrayList<>();
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: temp
+ <INPUT_NAME> or <OUTPUT_NAME>: customers
+
+ JDBC (PostgreSQL) with explicit schema.table
+ String sql = "SELECT * FROM analytics.orders";
+ try (Connection c = DriverManager.getConnection("jdbc:postgresql://host:5432/db");
+      Statement s = c.createStatement();
+      ResultSet rs = s.executeQuery(sql))
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: analytics.orders
+
+ JDBC (MySQL) database.table
+ String sql = "SELECT u.id, u.email FROM ecommerce.users u";
+ try (Connection c = DriverManager.getConnection("jdbc:mysql://host:3306/shop");
+      Statement s = c.createStatement();
+      ResultSet rs = s.executeQuery(sql))
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: ecommerce.users
+
+ JDBC (SQL Server) database.schema.table
+ String sql = "SELECT * FROM sales.dbo.orders";
+ try (Connection c = DriverManager.getConnection("jdbc:sqlserver://host;databaseName=sales");
+      Statement s = c.createStatement();
+      ResultSet rs = s.executeQuery(sql))
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: sales
+ <INPUT_NAME> or <OUTPUT_NAME>: dbo.orders
+
+ JDBC (Oracle) schema.table
+ String sql = "SELECT * FROM HR.EMPLOYEES";
+ try (Connection c = DriverManager.getConnection("jdbc:oracle:thin:@//host:1521/ORCLPDB1");
+      Statement s = c.createStatement();
+      ResultSet rs = s.executeQuery(sql))
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: HR.EMPLOYEES
+
+ Hibernate / JPA (Entity with schema)
+ @Entity
+ @Table(name = "orders", schema = "sales")
+ class Order { ... }
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: sales.orders
+
+ Hibernate / JPA (Entity without schema; default schema)
+ @Entity
+ @Table(name = "customers")
+ class Customer { ... }
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: customers
+
+ JDBC write (INSERT into schema.table)
+ String sql = "INSERT INTO analytics.daily_metrics (run_date, total) VALUES (?, ?)";
+ Expected:
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: analytics.daily_metrics
+
+ Notes:
+ - Use scheme prefixes for path-like sources/targets when present:
+   file./absolute/or/relative/path
+   s3./bucket/key
+   gs./bucket/key
+   abfs./container/path
+ - For in-memory variables/collections, use:
+   <NAMESPACE> = temp
+   <NAME> = <variable_or_field_name>
+ - For relational sources/targets referenced via SQL, prefer <NAME> = <schema.table>. If a database/catalog prefix exists (e.g., SQL Server), map it to <NAMESPACE> and keep <NAME> = <schema.table>. Otherwise use <NAMESPACE> = default.
+ - Wherever you can't find information for <STORAGE_LAYER>, <FILE_FORMAT>, <DATASET_TYPE>, <SUB_TYPE>, <LIFECYCLE>, <OWNER_NAME>, <OWNER_TYPE>, <SUBTYPE>, <DESCRIPTION>, write "NA".
+ - Very important: your output must follow **exactly** the JSON structure below; do not output explanations, comments, or anything else.
+
+ ---
+
+ ### Required Output Format (Example):
+
+ {
+   "inputs": [
+     {
+       "namespace": "<INPUT_NAMESPACE>",
+       "name": "<INPUT_NAME>",
+       "facets": {
+         "schema": {
+           "fields": [
+             {
+               "name": "<FIELD_NAME>",
+               "type": "<FIELD_TYPE>",
+               "description": "<FIELD_DESCRIPTION>"
+             }
+           ]
+         }
+       }
+     }
+   ],
+   "outputs": [
+     {
+       "namespace": "<OUTPUT_NAMESPACE>",
+       "name": "<OUTPUT_NAME>",
+       "facets": {
+         "columnLineage": {
+           "fields": {
+             "<OUTPUT_FIELD_NAME>": {
+               "inputFields": [
+                 {
+                   "namespace": "<INPUT_NAMESPACE>",
+                   "name": "<INPUT_NAME>",
+                   "field": "<INPUT_FIELD_NAME>",
+                   "transformations": [
+                     {
+                       "type": "<TRANSFORMATION_TYPE>",
+                       "subtype": "<SUBTYPE>",
+                       "description": "<DESCRIPTION>",
+                       "masking": false
+                     }
+                   ]
+                 }
+               ]
+             }
+           }
+         }
+       }
+     }
+   ]
+ }
+
+ 5. Return only results in the above-mentioned JSON schema format. Do not add any text.
+ """
lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_params.py ADDED
@@ -0,0 +1,9 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv(override=True)
+
+ # java_lineage_agent mcp server params
+ java_mcp_server_params = [
+     {"command": "python", "args": ["lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py"]},
+ ]
lf_algorithm/plugins/python_lineage_agent/__init__.py ADDED
@@ -0,0 +1 @@
+
lf_algorithm/plugins/python_lineage_agent/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (208 Bytes).
 
lf_algorithm/plugins/python_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc ADDED
Binary file (5.88 kB).
 
lf_algorithm/plugins/python_lineage_agent/__pycache__/python_instructions.cpython-313.pyc ADDED
Binary file (5.21 kB).
 
lf_algorithm/plugins/python_lineage_agent/lineage_agent.py ADDED
@@ -0,0 +1,97 @@
+ import os
+ import sys
+ import logging
+ from contextlib import AsyncExitStack
+ from agents import Agent, Tool, Runner, trace
+ from agents.mcp.server import MCPServerStdio
+ from typing import Dict, Any, Optional
+
+ from ...utils.tracers import log_trace_id
+ from ...plugins.python_lineage_agent.python_instructions import comprehensive_analysis_instructions
+ from ...plugins.python_lineage_agent.mcp_servers.mcp_params import python_mcp_server_params
+ from ...utils.file_utils import dump_json_record
+
+ # Get logger for this module
+ logger = logging.getLogger(__name__)
+
+ MAX_TURNS = 30  # Increased for comprehensive analysis
+
+
+ class PythonLineageAgent:
+     """Plugin agent for Python lineage analysis"""
+
+     def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None):
+         self.agent_name = agent_name
+         self.model_name = model_name
+         self.source_code = source_code
+         self.get_model_func = get_model_func
+
+     async def create_agent(self, python_mcp_servers) -> Agent:
+         # Use the passed get_model_func or fall back to the centralized one
+         if self.get_model_func:
+             model = self.get_model_func(self.model_name)
+         else:
+             from ...utils import get_model
+             model = get_model(self.model_name)
+
+         agent = Agent(
+             name=self.agent_name,
+             instructions=comprehensive_analysis_instructions(self.agent_name),
+             model=model,
+             mcp_servers=python_mcp_servers,
+         )
+         return agent
+
+     async def run_agent(self, python_mcp_servers, source_code: str):
+         # Create a single agent for comprehensive analysis
+         comprehensive_agent = await self.create_agent(python_mcp_servers)
+
+         # Run the complete analysis in one go
+         result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS)
+
+         # Return the final output
+         return dump_json_record(self.agent_name, result.final_output)
+
+     async def run_with_mcp_servers(self, source_code: str):
+         async with AsyncExitStack() as stack:
+             python_mcp_servers = [
+                 await stack.enter_async_context(
+                     MCPServerStdio(params, client_session_timeout_seconds=120)
+                 )
+                 for params in python_mcp_server_params
+             ]
+             return await self.run_agent(python_mcp_servers, source_code=source_code)
+
+     async def run_with_trace(self, source_code: str):
+         trace_name = f"{self.agent_name}-lineage-agent"
+         trace_id = log_trace_id(f"{self.agent_name.lower()}")
+         with trace(trace_name, trace_id=trace_id):
+             return await self.run_with_mcp_servers(source_code=source_code)
+
+     async def run(self):
+         try:
+             logger.info(f"Starting Python lineage analysis for {self.agent_name}")
+             result = await self.run_with_trace(self.source_code)
+             logger.info(f"Completed Python lineage analysis for {self.agent_name}")
+             return result
+         except Exception as e:
+             logger.error(f"Error running {self.agent_name}: {e}")
+             return {"error": str(e)}
+
+
+ # Plugin interface functions
+ def create_python_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> PythonLineageAgent:
+     """Factory function to create a PythonLineageAgent instance"""
+     return PythonLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func)
+
+
+ def get_plugin_info() -> Dict[str, Any]:
+     """Return plugin metadata"""
+     return {
+         "name": "python-lineage-agent",
+         "description": "Python lineage analysis agent for parsing and analyzing Python code",
+         "version": "1.0.0",
+         "author": "Ali Shamsaddinlou",
+         "agent_class": PythonLineageAgent,
+         "factory_function": create_python_lineage_agent,
+     }
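The get_plugin_info() contract gives callers a uniform way to discover and instantiate plugins without hard-coding classes. A sketch of that generic path, assuming the package is importable and model credentials are configured (the sample Python source string is illustrative):

import asyncio
from lf_algorithm.plugins.python_lineage_agent import lineage_agent as plugin

info = plugin.get_plugin_info()
print(info["name"], info["version"])

# Instantiate through the advertised factory rather than the class directly
agent = info["factory_function"](agent_name="py-demo", source_code="df = pd.read_csv('sales.csv')")
result = asyncio.run(agent.run())
print(result)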
lf_algorithm/plugins/python_lineage_agent/mcp_servers/__init__.py ADDED
File without changes