alishams21 committed
Commit e00e744 · verified · 1 Parent(s): 035331b

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. LICENSE +197 -0
  2. MANIFEST.in +39 -0
  3. README.md +203 -8
  4. cli/README.md +167 -0
  5. cli/__init__.py +5 -0
  6. cli/main.py +238 -0
  7. demo_server.py +321 -0
  8. deploy_setup.py +43 -0
  9. lf_algorithm/__init__.py +46 -0
  10. lf_algorithm/__pycache__/__init__.cpython-313.pyc +0 -0
  11. lf_algorithm/__pycache__/agent_manager.cpython-313.pyc +0 -0
  12. lf_algorithm/__pycache__/framework_agent.cpython-313.pyc +0 -0
  13. lf_algorithm/agent_manager.py +84 -0
  14. lf_algorithm/framework_agent.py +130 -0
  15. lf_algorithm/models/__pycache__/models.cpython-313.pyc +0 -0
  16. lf_algorithm/models/models.py +285 -0
  17. lf_algorithm/plugins/__init__.py +1 -0
  18. lf_algorithm/plugins/__pycache__/__init__.cpython-313.pyc +0 -0
  19. lf_algorithm/plugins/airflow_lineage_agent/__init__.py +1 -0
  20. lf_algorithm/plugins/airflow_lineage_agent/__pycache__/__init__.cpython-313.pyc +0 -0
  21. lf_algorithm/plugins/airflow_lineage_agent/__pycache__/airflow_instructions.cpython-313.pyc +0 -0
  22. lf_algorithm/plugins/airflow_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc +0 -0
  23. lf_algorithm/plugins/airflow_lineage_agent/airflow_instructions.py +98 -0
  24. lf_algorithm/plugins/airflow_lineage_agent/lineage_agent.py +98 -0
  25. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__init__.py +0 -0
  26. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc +0 -0
  27. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc +0 -0
  28. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/__init__.py +0 -0
  29. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py +55 -0
  30. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/templates.py +777 -0
  31. lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_params.py +9 -0
  32. lf_algorithm/plugins/java_lineage_agent/__init__.py +1 -0
  33. lf_algorithm/plugins/java_lineage_agent/__pycache__/__init__.cpython-313.pyc +0 -0
  34. lf_algorithm/plugins/java_lineage_agent/__pycache__/java_instructions.cpython-313.pyc +0 -0
  35. lf_algorithm/plugins/java_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc +0 -0
  36. lf_algorithm/plugins/java_lineage_agent/java_instructions.py +98 -0
  37. lf_algorithm/plugins/java_lineage_agent/lineage_agent.py +97 -0
  38. lf_algorithm/plugins/java_lineage_agent/mcp_servers/__init__.py +0 -0
  39. lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc +0 -0
  40. lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc +0 -0
  41. lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/__init__.py +0 -0
  42. lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py +55 -0
  43. lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/templates.py +605 -0
  44. lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_params.py +9 -0
  45. lf_algorithm/plugins/python_lineage_agent/__init__.py +1 -0
  46. lf_algorithm/plugins/python_lineage_agent/__pycache__/__init__.cpython-313.pyc +0 -0
  47. lf_algorithm/plugins/python_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc +0 -0
  48. lf_algorithm/plugins/python_lineage_agent/__pycache__/python_instructions.cpython-313.pyc +0 -0
  49. lf_algorithm/plugins/python_lineage_agent/lineage_agent.py +97 -0
  50. lf_algorithm/plugins/python_lineage_agent/mcp_servers/__init__.py +0 -0
LICENSE ADDED
@@ -0,0 +1,197 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity granting the License.
13
+
14
+ "Legal Entity" shall mean the union of the acting entity and all
15
+ other entities that control, are controlled by, or are under common
16
+ control with that entity. For the purposes of this definition,
17
+ "control" means (i) the power, direct or indirect, to cause the
18
+ direction or management of such entity, whether by contract or
19
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
20
+ outstanding shares, or (iii) beneficial ownership of such entity.
21
+
22
+ "You" (or "Your") shall mean an individual or Legal Entity
23
+ exercising permissions granted by this License.
24
+
25
+ "Source" form shall mean the preferred form for making modifications,
26
+ including but not limited to software source code, documentation
27
+ source, and configuration files.
28
+
29
+ "Object" form shall mean any form resulting from mechanical
30
+ transformation or translation of a Source form, including but
31
+ not limited to compiled object code, generated documentation,
32
+ and conversions to other media types.
33
+
34
+ "Work" shall mean the work of authorship, whether in Source or
35
+ Object form, made available under the License, as indicated by a
36
+ copyright notice that is included in or attached to the work
37
+ (which shall not include communications that are clearly marked or
38
+ otherwise designated in writing by the copyright owner as "Not a Contribution").
39
+
40
+ "Contribution" shall mean any work of authorship, including
41
+ the original version of the Work and any modifications or additions
42
+ to that Work or Derivative Works thereof, that is intentionally
43
+ submitted to Licensor for inclusion in the Work by the copyright owner
44
+ or by an individual or Legal Entity authorized to submit on behalf of
45
+ the copyright owner. For the purposes of this definition, "submitted"
46
+ means any form of electronic, verbal, or written communication sent
47
+ to the Licensor or its representatives, including but not limited to
48
+ communication on electronic mailing lists, source code control systems,
49
+ and issue tracking systems that are managed by, or on behalf of, the
50
+ Licensor for the purpose of discussing and improving the Work, but
51
+ excluding communication that is conspicuously marked or otherwise
52
+ designated in writing by the copyright owner as "Not a Contribution."
53
+
54
+ "Contributor" shall mean Licensor and any individual or Legal Entity
55
+ on behalf of whom a Contribution has been received by Licensor and
56
+ subsequently incorporated within the Work.
57
+
58
+ 2. Grant of Copyright License. Subject to the terms and conditions of
59
+ this License, each Contributor hereby grants to You a perpetual,
60
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
61
+ copyright license to use, reproduce, modify, merge, publish,
62
+ distribute, sublicense, and/or sell copies of the Work, and to
63
+ permit persons to whom the Work is furnished to do so, subject to
64
+ the following conditions:
65
+
66
+ The above copyright notice and this permission notice shall be
67
+ included in all copies or substantial portions of the Work.
68
+
69
+ 3. Grant of Patent License. Subject to the terms and conditions of
70
+ this License, each Contributor hereby grants to You a perpetual,
71
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
72
+ (except as stated in this section) patent license to make, have made,
73
+ use, offer to sell, sell, import, and otherwise transfer the Work,
74
+ where such license applies only to those patent claims licensable
75
+ by such Contributor that are necessarily infringed by their
76
+ Contribution(s) alone or by combination of their Contribution(s)
77
+ with the Work to which such Contribution(s) was submitted. If You
78
+ institute patent litigation against any entity (including a
79
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
80
+ or a Contribution incorporated within the Work constitutes direct
81
+ or contributory patent infringement, then any patent licenses
82
+ granted to You under this License for that Work shall terminate
83
+ as of the date such litigation is filed.
84
+
85
+ 4. Redistribution. You may reproduce and distribute copies of the
86
+ Work or Derivative Works thereof in any medium, with or without
87
+ modifications, and in Source or Object form, provided that You
88
+ meet the following conditions:
89
+
90
+ (a) You must give any other recipients of the Work or
91
+ Derivative Works a copy of this License; and
92
+
93
+ (b) You must cause any modified files to carry prominent notices
94
+ stating that You changed the files; and
95
+
96
+ (c) You must retain, in the Source form of any Derivative Works
97
+ that You distribute, all copyright, trademark, patent, and
98
+ other attribution notices from the Source form of the Work,
99
+ excluding those notices that do not pertain to any part of
100
+ the Derivative Works; and
101
+
102
+ (d) If the Work includes a "NOTICE" file as part of its
103
+ distribution, then any Derivative Works that You distribute must
104
+ include a readable copy of the attribution notices contained
105
+ within such NOTICE file, excluding those notices that do not
106
+ pertain to any part of the Derivative Works, in at least one
107
+ of the following places: within a NOTICE file distributed
108
+ as part of the Derivative Works; within the Source form or
109
+ documentation, if provided along with the Derivative Works; or,
110
+ within a display generated by the Derivative Works, if and
111
+ wherever such third-party notices normally appear. The contents
112
+ of the NOTICE file are for informational purposes only and
113
+ do not modify the License. You may add Your own attribution
114
+ notices within Derivative Works that You distribute, alongside
115
+ or as an addendum to the NOTICE text from the Work, provided
116
+ that such additional attribution notices cannot be construed
117
+ as modifying the License.
118
+
119
+ You may add Your own copyright notice to Your modifications and
120
+ may provide additional or different license terms and conditions
121
+ for use, reproduction, or distribution of Your modifications, or
122
+ for any such Derivative Works as a whole, provided Your use,
123
+ reproduction, and distribution of the Work otherwise complies with
124
+ the conditions stated in this License.
125
+
126
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
127
+ any Contribution intentionally submitted for inclusion in the Work
128
+ by You to the Licensor shall be under the terms and conditions of
129
+ this License, without any additional terms or conditions.
130
+ Notwithstanding the above, nothing herein shall supersede or modify
131
+ the terms of any separate license agreement you may have executed
132
+ with Licensor regarding such Contributions.
133
+
134
+ 6. Trademarks. This License does not grant permission to use the trade
135
+ names, trademarks, service marks, or product names of the Licensor,
136
+ except as required for reasonable and customary use in describing the
137
+ origin of the Work and reproducing the content of the NOTICE file.
138
+
139
+ 7. Disclaimer of Warranty. Unless required by applicable law or
140
+ agreed to in writing, Licensor provides the Work (and each
141
+ Contributor provides its Contributions) on an "AS IS" BASIS,
142
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
143
+ implied, including, without limitation, any warranties or conditions
144
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
145
+ PARTICULAR PURPOSE. You are solely responsible for determining the
146
+ appropriateness of using or redistributing the Work and assume any
147
+ risks associated with Your exercise of permissions under this License.
148
+
149
+ 8. Limitation of Liability. In no event and under no legal theory,
150
+ whether in tort (including negligence), contract, or otherwise,
151
+ unless required by applicable law (such as deliberate and grossly
152
+ negligent acts) or agreed to in writing, shall any Contributor be
153
+ liable to You for damages, including any direct, indirect, special,
154
+ incidental, or consequential damages of any character arising as a
155
+ result of this License or out of the use or inability to use the
156
+ Work (including but not limited to damages for loss of goodwill,
157
+ work stoppage, computer failure or malfunction, or any and all
158
+ other commercial damages or losses), even if such Contributor
159
+ has been advised of the possibility of such damages.
160
+
161
+ 9. Accepting Warranty or Additional Liability. When redistributing
162
+ the Work or Derivative Works thereof, You may choose to offer,
163
+ and to charge a fee for, acceptance of support, warranty, indemnity,
164
+ or other liability obligations and/or rights consistent with this
165
+ License. However, in accepting such obligations, You may act only
166
+ on Your own behalf and on Your sole responsibility, not on behalf
167
+ of any other Contributor, and only if You agree to indemnify,
168
+ defend, and hold each Contributor harmless for any liability
169
+ incurred by, or claims asserted against, such Contributor by reason
170
+ of your accepting any such warranty or additional liability.
171
+
172
+ END OF TERMS AND CONDITIONS
173
+
174
+ APPENDIX: How to apply the Apache License to your work.
175
+
176
+ To apply the Apache License to your work, attach the following
177
+ boilerplate notice, with the fields enclosed by brackets "[]"
178
+ replaced with your own identifying information. (Don't include
179
+ the brackets!) The text should be enclosed in the appropriate
180
+ comment syntax for the file format. We also recommend that a
181
+ file or class name and description of purpose be included on the
182
+ same page as the copyright notice for easier identification within
183
+ third-party archives.
184
+
185
+ Copyright [yyyy] [name of copyright owner]
186
+
187
+ Licensed under the Apache License, Version 2.0 (the "License");
188
+ you may not use this file except in compliance with the License.
189
+ You may obtain a copy of the License at
190
+
191
+ http://www.apache.org/licenses/LICENSE-2.0
192
+
193
+ Unless required by applicable law or agreed to in writing, software
194
+ distributed under the License is distributed on an "AS IS" BASIS,
195
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
196
+ See the License for the specific language governing permissions and
197
+ limitations under the License.
MANIFEST.in ADDED
@@ -0,0 +1,39 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ include MANIFEST.in
5
+
6
+ # Include images directory for README.md
7
+ recursive-include images *
8
+
9
+ # Include package data
10
+ recursive-include algorithm *.json
11
+ recursive-include algorithm *.yaml
12
+ recursive-include algorithm *.yml
13
+ recursive-include algorithm *.txt
14
+ recursive-include algorithm *.md
15
+ recursive-include cli *.json
16
+ recursive-include cli *.yaml
17
+ recursive-include cli *.yml
18
+ recursive-include cli *.txt
19
+ recursive-include cli *.md
20
+
21
+
22
+ # Include templates and configuration files
23
+ include lf_algorithm/plugins/*/mcp_servers/*/templates.py
24
+ include lf_algorithm/plugins/*/mcp_servers/*/mcp_params.py
25
+
26
+ # Exclude development files
27
+ global-exclude *.pyc
28
+ global-exclude *.pyo
29
+ global-exclude __pycache__
30
+ global-exclude .DS_Store
31
+ global-exclude *.log
32
+ global-exclude .pytest_cache
33
+ global-exclude .mypy_cache
34
+ global-exclude .venv
35
+ global-exclude venv
36
+ global-exclude env
37
+ global-exclude .env
38
+ global-exclude .pypirc
39
+ global-exclude .ruff_cache
README.md CHANGED
@@ -1,12 +1,207 @@
1
  ---
2
- title: Lineagentic Flow
3
- emoji:
4
- colorFrom: blue
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: lineagentic-flow
3
+ app_file: start_demo_server.py
4
  sdk: gradio
5
+ sdk_version: 5.39.0
6
  ---
7
 
8
+ <div align="center">
9
+ <img src="https://raw.githubusercontent.com/lineagentic/lineagentic-flow/main/images/logo.jpg" alt="Lineagentic Logo" width="880" height="300">
10
+ </div>
11
+
12
+ ## Lineagentic-flow
13
+
14
+ Lineagentic-flow is an agentic AI solution for building end-to-end data lineage from diverse types of data processing scripts across different platforms. It is modular and customizable, and can be extended to support new script types. In a nutshell, this is what it does:
15
+
16
+ ```
17
+ ┌─────────────┐     ┌──────────────────────────────┐     ┌──────────────────┐
18
+ │ source-code │───▶ │  lineagentic-flow-algorithm  │───▶ │  lineage output  │
19
+ │             │     │                              │     │                  │
20
+ └─────────────┘     └──────────────────────────────┘     └──────────────────┘
21
+ ```
22
+ ### Features
23
+
24
+ - Plugin-based design pattern, simple to extend and customize.
25
+ - Command-line interface for quick analysis.
26
+ - Support for multiple data processing script types (SQL, Python, Airflow, Spark, etc.).
27
+ - Simple demo server that runs locally and on Hugging Face Spaces.
28
+
29
+ ## Quick Start
30
+
31
+ ### Installation
32
+
33
+ Install the package from PyPI:
34
+
35
+ ```bash
36
+ pip install lineagentic-flow
37
+ ```
38
+
39
+ ### Basic Usage
40
+
41
+ ```python
42
+ import asyncio
43
+ from lf_algorithm.framework_agent import FrameworkAgent
44
+ import logging
45
+
46
+ logging.basicConfig(
47
+ level=logging.INFO,
48
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
49
+ )
50
+
51
+ async def main():
52
+ # Create an agent for SQL lineage extraction
53
+ agent = FrameworkAgent(
54
+ agent_name="sql-lineage-agent",
55
+ model_name="gpt-4o-mini",
56
+ source_code="SELECT id, name FROM users WHERE active = true"
57
+ )
58
+
59
+ # Run the agent to extract lineage
60
+ result = await agent.run_agent()
61
+ print(result)
62
+
63
+ # Run the example
64
+ asyncio.run(main())
65
+ ```
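+
+ The returned result is an `AgentResult` (exported from `lf_algorithm`). Continuing the example above, here is a minimal sketch of inspecting it, assuming the `to_dict()` structure that the demo server relies on (`inputs` and `outputs` keys):
+
+ ```python
+ import json
+
+ # AgentResult exposes to_dict(); plain dicts pass through unchanged.
+ result_dict = result.to_dict() if hasattr(result, "to_dict") else result
+ print(json.dumps(result_dict, indent=2))
+ print(len(result_dict.get("inputs", [])), "input(s),",
+       len(result_dict.get("outputs", [])), "output(s)")
+ ```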
66
+ ### Supported Agents
67
+
68
+ The following table shows the current development status of the agents in the Lineagentic-flow algorithm:
69
+
70
+
71
+ | **Agent Name** | **Done** | **Under Development** | **In Backlog** | **Comment** |
72
+ |----------------------|:--------:|:----------------------:|:--------------:|--------------------------------------|
73
+ | python_lineage_agent | ✓ | | | |
74
+ | airflow_lineage_agent | ✓ | | | |
75
+ | java_lineage_agent | ✓ | | | |
76
+ | spark_lineage_agent | ✓ | | | |
77
+ | sql_lineage_agent | ✓ | | | |
78
+ | flink_lineage_agent | | | ✓ | |
79
+ | beam_lineage_agent | | | ✓ | |
80
+ | shell_lineage_agent | | | ✓ | |
81
+ | scala_lineage_agent | | | ✓ | |
82
+ | dbt_lineage_agent | | | ✓ | |
83
+
84
+
85
+ ### Environment Variables
86
+
87
+ Set your API keys:
88
+
89
+ ```bash
90
+ export OPENAI_API_KEY="your-openai-api-key"
91
+ export HF_TOKEN="your-huggingface-token" # Optional
92
+ ```
93
+
94
+ ## What are the components of Lineagentic-flow?
95
+
96
+ - Algorithm module: the brain of Lineagentic-flow. It contains agents, implemented as plugins, that act as a chain-of-thought process to extract lineage from different types of data processing scripts. The module is built on a plugin-based design pattern, so you can easily develop and integrate your own custom agents (see the sketch after this list).
97
+
98
+ - CLI module: a command-line wrapper around the algorithm API that connects to the unified service layer.
99
+
100
+ - Demo module: for teams who want to demo Lineagentic-flow in a fast and simple way, deployable to Hugging Face Spaces.
101
+
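+ Since agents are discovered through Python entry points (see `lf_algorithm/agent_manager.py`), a custom plugin only needs to expose a `get_plugin_info()` function returning its name and factory. Below is a minimal sketch of a hypothetical plugin; the `dbt` agent name, module path, and factory signature are illustrative assumptions, not part of the published API:
+
+ ```python
+ # my_plugin/lineage_agent.py -- hypothetical custom plugin module
+
+ def create_dbt_lineage_agent(agent_name, source_code=None,
+                              model_name="gpt-4o-mini",
+                              get_model_func=None, **kwargs):
+     """Factory: AgentManager calls this with agent_name and get_model_func."""
+     ...  # build and return your agent instance here
+
+ def get_plugin_info():
+     # AgentManager reads the 'name' and 'factory_function' keys from this dict.
+     return {
+         "name": "dbt-lineage-agent",
+         "factory_function": create_dbt_lineage_agent,
+     }
+
+ # Registered in pyproject.toml under the entry-point group, e.g.:
+ # [project.entry-points."lineagentic.lf_algorithm.plugins"]
+ # dbt-lineage-agent = "my_plugin.lineage_agent:get_plugin_info"
+ ```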
102
+ #### Command Line Interface (CLI)
103
+
104
+ Lineagentic-flow provides a powerful CLI tool for quick analysis:
105
+
106
+ ```bash
107
+ # Basic SQL query analysis
108
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT user_id, name FROM users WHERE active = true" --verbose
109
+
110
+ # Analyze a Python script from a file
111
+ lineagentic analyze --agent-name python-lineage-agent --query-file "my_script.py" --verbose
112
+
113
+ ```
114
+ For more details, see the [CLI documentation](cli/README.md).
115
+
116
+ ### Environment variables
117
+
118
+ - HF_TOKEN (HUGGINGFACE_TOKEN)
119
+ - OPENAI_API_KEY
120
+
121
+ ### Architecture
122
+
123
+ The following figure illustrates the architecture behind Lineagentic-flow: a multi-layer architecture of backend services and an agentic AI algorithm that leverages a chain-of-thought process to construct lineage across various script types.
124
+
125
+ ![Architecture Diagram](https://raw.githubusercontent.com/lineagentic/lineagentic-flow/main/images/architecture.png)
126
+
127
+
128
+ ## Mathematics behind the algorithm
129
+
130
+ The following shows the mathematics behind each layer of the algorithm.
131
+
132
+ ### Agent framework
133
+ The agent framework does IO operations, memory management, and prompt engineering, producing a prompt (P) according to the script type (T) and its content (C).
134
+
135
+ $$
136
+ P := f(T, C)
137
+ $$
138
+
139
+ ### Runtime orchestration agent
140
+
141
+ The runtime orchestration agent orchestrates the execution of the required agents, given the prompt (P) produced by the agent framework, by selecting each appropriate agent (A) and its corresponding task (T).
142
+
143
+ $$
144
+ G = h([\{(A_1, T_1), (A_2, T_2), (A_3, T_3), (A_4, T_4)\}], P)
145
+ $$
146
+
147
+ ### Syntax Analysis Agent
148
+
149
+ The Syntax Analysis agent analyzes the syntactic structure of the raw script to identify subqueries and nested structures, and decomposes the script into multiple subscripts.
150
+
151
+ $$
152
+ \{sa_1, \dots, sa_n\} := h([A_1, T_1], P)
153
+ $$
154
+
155
+ ### Field Derivation Agent
156
+ The Field Derivation agent processes each subscript produced by the Syntax Analysis agent to derive field-level mapping relationships and processing logic.
157
+
158
+ $$
159
+ \{fd_1, \dots, fd_n\} := h([A_2, T_2], \{sa_1, \dots, sa_n\})
160
+ $$
161
+
162
+ ### Operation Tracing Agent
163
+ The Operation Tracing agent analyzes the complex conditions within each subscript identified by the Syntax Analysis agent, including filter, join, grouping, and sorting conditions.
164
+
165
+ $$
166
+ \{ot_1, \dots, ot_n\} := h([A_3, T_3], \{sa_1, \dots, sa_n\})
167
+ $$
168
+
169
+ ### Event Composer Agent
170
+ The Event Composer agent consolidates the results from the Syntax Analysis, Field Derivation, and Operation Tracing agents to generate the final lineage result.
171
+
172
+ $$
173
+ \{A\} := h([A_4, T_4], \{sa_1, \dots, sa_n\}, \{fd_1, \dots, fd_n\}, \{ot_1, \dots, ot_n\})
174
+ $$
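+
+ To make the composition concrete, here is a minimal runnable Python sketch of the four-stage chain; the stage functions are illustrative placeholders, not the library API:
+
+ ```python
+ # Illustrative sketch of the chain-of-thought composition above.
+ def syntax_analysis(prompt):    return ["subscript 1", "subscript 2"]      # {sa_i}
+ def field_derivation(sa):       return [f"fields of {s}" for s in sa]      # {fd_i}
+ def operation_tracing(sa):      return [f"conditions of {s}" for s in sa]  # {ot_i}
+ def event_composer(sa, fd, ot): return {"subscripts": sa, "fields": fd, "operations": ot}
+
+ def run_pipeline(prompt):
+     sa = syntax_analysis(prompt)        # {sa_1, ..., sa_n} := h([A_1, T_1], P)
+     fd = field_derivation(sa)           # {fd_1, ..., fd_n} := h([A_2, T_2], {sa_i})
+     ot = operation_tracing(sa)          # {ot_1, ..., ot_n} := h([A_3, T_3], {sa_i})
+     return event_composer(sa, fd, ot)   # {A} := h([A_4, T_4], {sa_i}, {fd_i}, {ot_i})
+
+ print(run_pipeline("P"))
+ ```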
175
+
176
+
177
+
178
+ ## Activation and Deployment
179
+
180
+ To simplify the usage of Lineagentic-flow, a Makefile manages the various activation and deployment tasks. The most common targets are listed below; for full details, explore the Makefile directly.
181
+
182
+ 1- To start the demo server:
183
+
184
+ ```bash
185
+ make start-demo-server
186
+ ```
187
+ 2- To run all tests:
188
+
189
+ ```bash
190
+ make test
191
+ ```
192
+ 3- To build the package:
193
+
194
+ ```bash
195
+ make build-package
196
+ ```
197
+ 4- To clean the whole stack:
198
+
199
+ ```bash
200
+ make clean-all-stack
201
+ ```
202
+
203
+ 5- To deploy Lineagentic-flow to Hugging Face Spaces, run the following command (you need a Hugging Face account, and you must add your secret keys there if you are going to use paid models):
204
+
205
+ ```bash
206
+ make gradio-deploy
207
+ ```
cli/README.md ADDED
@@ -0,0 +1,167 @@
1
+ # Lineagentic-flow CLI
2
+
3
+ A command-line interface for the Lineagentic-flow framework that provides agentic data lineage parsing across various data processing script types.
4
+
5
+ ## Installation
6
+
7
+ The CLI is installed automatically along with the lineagentic-flow package (shown here as an editable install from source):
8
+
9
+ ```bash
10
+ pip install -e .
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ The CLI provides two main commands: `analyze` and `field-lineage`.
16
+
17
+ ### Basic Commands
18
+
19
+ #### Analyze Query/Code for Lineage
20
+ ```bash
21
+ lineagentic analyze --agent-name sql-lineage-agent --query "your code here"
22
+ ```
23
+
24
+
25
+ ### Running Analysis
26
+
27
+ #### Using a Specific Agent
28
+ ```bash
29
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a,b FROM table1"
30
+ ```
31
+
32
+ #### Using a File as Input
33
+ ```bash
34
+ lineagentic analyze --agent-name python-lineage-agent --query-file path/to/your/script.py
35
+ ```
36
+
37
+ #### Specifying a Different Model
38
+ ```bash
39
+ lineagentic analyze --agent-name airflow-lineage-agent --model-name gpt-4o --query "your code here"
40
+ ```
41
+
42
+ #### With Lineage Configuration
43
+ ```bash
44
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --job-namespace "my-namespace" --job-name "my-job"
45
+ ```
46
+
47
+ ### Output Options
48
+
49
+ #### Pretty Print Results
50
+ ```bash
51
+ lineagentic analyze --agent-name sql --query "your code" --pretty
52
+ ```
53
+
54
+ #### Save Results to File
55
+ ```bash
56
+ lineagentic analyze --agent-name sql --query "your code" --output results.json
57
+ ```
58
+
59
+ #### Save Results with Pretty Formatting
60
+ ```bash
61
+ lineagentic analyze --agent-name python --query "your code" --output results.json --pretty
62
+ ```
63
+
64
+ #### Enable Verbose Output
65
+ ```bash
66
+ lineagentic analyze --agent-name sql --query "your code" --verbose
67
+ ```
68
+
69
+ ## Available Agents
70
+
71
+ - **sql-lineage-agent**: Analyzes SQL queries and scripts (default)
72
+ - **airflow-lineage-agent**: Analyzes Apache Airflow DAGs and workflows
73
+ - **spark-lineage-agent**: Analyzes Apache Spark jobs
74
+ - **python-lineage-agent**: Analyzes Python data processing scripts
75
+ - **java-lineage-agent**: Analyzes Java data processing code
76
+
77
+ ## Commands
78
+
79
+ ### `analyze` Command
80
+
81
+ Analyzes a query or code for lineage information.
82
+
83
+ #### Required Arguments
84
+ - Either `--query` or `--query-file` must be specified
85
+
86
+ ### Basic Query Analysis
87
+ ```bash
88
+ # Simple SQL query analysis
89
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT user_id, name FROM users WHERE active = true"
90
+
91
+ # Analyze with specific agent
92
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a, b FROM table1 JOIN table2 ON table1.id = table2.id"
93
+
94
+ # Analyze Python code
95
+ lineagentic analyze --agent-name python-lineage-agent --query "import pandas as pd; df = pd.read_csv('data.csv'); result = df.groupby('category').sum()"
96
+
97
+ # Analyze Java code
98
+ lineagentic analyze --agent-name java-lineage-agent --query "public class DataProcessor { public void processData() { // processing logic } }"
99
+
100
+ # Analyze Spark code
101
+ lineagentic analyze --agent-name spark-lineage-agent --query "val df = spark.read.csv('data.csv'); val result = df.groupBy('category').agg(sum('value'))"
102
+
103
+ # Analyze Airflow DAG
104
+ lineagentic analyze --agent-name airflow-lineage-agent --query "from airflow import DAG; from airflow.operators.python import PythonOperator; dag = DAG('my_dag')"
105
+ ```
106
+
107
+
108
+ ### Reading from File
109
+ ```bash
110
+ # Analyze query from file
111
+ lineagentic analyze --agent-name sql-lineage-agent --query-file "queries/user_analysis.sql"
112
+
113
+ # Analyze Python script from file
114
+ lineagentic analyze --agent-name python-lineage-agent --query-file "scripts/data_processing.py"
115
+ ```
116
+
117
+ ### Output Options
118
+ ```bash
119
+ # Save results to file
120
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --output "results.json"
121
+
122
+ # Pretty print results
123
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --pretty
124
+
125
+ # Verbose output
126
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --verbose
127
+
128
+ # Don't save to database
129
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --no-save
130
+
131
+ # Don't save to Neo4j
132
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT * FROM users" --no-neo4j
133
+ ```
134
+
135
+
136
+
137
+ ## Common Output Options
138
+
139
+ Both commands support these output options:
140
+
141
+ - `--output`: Output file path for results (JSON format)
142
+ - `--pretty`: Pretty print the output
143
+ - `--verbose`: Enable verbose output
144
+
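+ For example, a saved `results.json` can be inspected like this (a minimal sketch; the `inputs`/`outputs` keys follow the AgentResult structure used elsewhere in the repository):
+
+ ```python
+ import json
+
+ # Load a result file written via --output results.json
+ with open("results.json", encoding="utf-8") as f:
+     result = json.load(f)
+ print(result.get("inputs"), result.get("outputs"))
+ ```
+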
145
+ ## Error Handling
146
+
147
+ The CLI provides clear error messages for common issues:
148
+
149
+ - Missing required arguments
150
+ - File not found errors
151
+ - Agent execution errors
152
+ - Invalid agent names
153
+
154
+ ## Development
155
+
156
+ To run the CLI in development mode:
157
+
158
+ ```bash
159
+ python -m cli.main --help
160
+ ```
161
+
162
+ To run a specific command:
163
+
164
+ ```bash
165
+ python -m cli.main analyze --agent-name sql --query "SELECT 1" --pretty
166
+ ```
167
+
cli/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """
2
+ CLI package for lineagentic framework.
3
+ """
4
+
5
+ __version__ = "0.1.0"
cli/main.py ADDED
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Main CLI entry point for lineagentic framework.
4
+ """
5
+
6
+ import asyncio
7
+ import argparse
8
+ import sys
9
+ import os
10
+ import logging
11
+ from pathlib import Path
12
+
13
+ # Add the project root to the Python path
14
+ project_root = Path(__file__).parent.parent
15
+ sys.path.insert(0, str(project_root))
16
+
17
+ from lf_algorithm.framework_agent import FrameworkAgent
18
+
19
+
20
+ def configure_logging(verbose: bool = False, quiet: bool = False):
21
+ """Configure logging for the CLI application."""
22
+ if quiet:
23
+ # Quiet mode: only show errors
24
+ logging.basicConfig(
25
+ level=logging.ERROR,
26
+ format='%(levelname)s: %(message)s'
27
+ )
28
+ elif verbose:
29
+ # Verbose mode: show all logs with detailed format
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
33
+ datefmt='%Y-%m-%d %H:%M:%S'
34
+ )
35
+ else:
36
+ # Normal mode: show only important logs with clean format
37
+ logging.basicConfig(
38
+ level=logging.WARNING, # Only show warnings and errors by default
39
+ format='%(levelname)s: %(message)s'
40
+ )
41
+
42
+ # Set specific loggers to INFO level for better user experience
43
+ logging.getLogger('lf_algorithm').setLevel(logging.INFO)
44
+ logging.getLogger('lf_algorithm.framework_agent').setLevel(logging.INFO)
45
+ logging.getLogger('lf_algorithm.agent_manager').setLevel(logging.INFO)
46
+
47
+ # Suppress noisy server logs from MCP tools
48
+ logging.getLogger('mcp').setLevel(logging.WARNING)
49
+ logging.getLogger('agents.mcp').setLevel(logging.WARNING)
50
+ logging.getLogger('agents.mcp.server').setLevel(logging.WARNING)
51
+ logging.getLogger('agents.mcp.server.stdio').setLevel(logging.WARNING)
52
+ logging.getLogger('agents.mcp.server.stdio.stdio').setLevel(logging.WARNING)
53
+
54
+ # Suppress MCP library logs specifically
55
+ logging.getLogger('mcp.server').setLevel(logging.WARNING)
56
+ logging.getLogger('mcp.server.fastmcp').setLevel(logging.WARNING)
57
+ logging.getLogger('mcp.server.stdio').setLevel(logging.WARNING)
58
+
59
+ # Suppress any logger that contains 'server' in the name
60
+ for logger_name in logging.root.manager.loggerDict:
61
+ if 'server' in logger_name.lower():
62
+ logging.getLogger(logger_name).setLevel(logging.WARNING)
63
+
64
+ # Additional MCP-specific suppressions
65
+ logging.getLogger('mcp.server.stdio.stdio').setLevel(logging.WARNING)
66
+ logging.getLogger('mcp.server.stdio.stdio.stdio').setLevel(logging.WARNING)
67
+
68
+ def create_parser():
69
+ """Create and configure the argument parser."""
70
+ parser = argparse.ArgumentParser(
71
+ description="Lineagentic - Agentic approach for code analysis and lineage extraction",
72
+ formatter_class=argparse.RawDescriptionHelpFormatter,
73
+ epilog="""
74
+ Examples:
75
+
76
+ lineagentic analyze --agent-name sql-lineage-agent --query "SELECT a,b FROM table1"
77
+ lineagentic analyze --agent-name python-lineage-agent --query-file "my_script.py"
78
+ """
79
+ )
80
+
81
+ # Create subparsers for the CLI commands (only 'analyze' is registered here)
82
+ subparsers = parser.add_subparsers(dest='command', help='Available commands')
83
+
84
+ # Analyze query subparser
85
+ analyze_parser = subparsers.add_parser('analyze', help='Analyze code or query for lineage information')
86
+ analyze_parser.add_argument(
87
+ "--agent-name",
88
+ type=str,
89
+ default="sql",
90
+ help="Name of the agent to use (e.g., sql, airflow, spark, python, java) (default: sql)"
91
+ )
92
+ analyze_parser.add_argument(
93
+ "--model-name",
94
+ type=str,
95
+ default="gpt-4o-mini",
96
+ help="Model to use for the agents (default: gpt-4o-mini)"
97
+ )
98
+ analyze_parser.add_argument(
99
+ "--query",
100
+ type=str,
101
+ help="Code or query to analyze"
102
+ )
103
+ analyze_parser.add_argument(
104
+ "--query-file",
105
+ type=str,
106
+ help="Path to file containing the query/code to analyze"
107
+ )
108
+
109
+ # Common output options
110
+ analyze_parser.add_argument(
111
+ "--output",
112
+ type=str,
113
+ help="Output file path for results (JSON format)"
114
+ )
115
+ analyze_parser.add_argument(
116
+ "--pretty",
117
+ action="store_true",
118
+ help="Pretty print the output"
119
+ )
120
+ analyze_parser.add_argument(
121
+ "--verbose",
122
+ action="store_true",
123
+ help="Enable verbose output with detailed logging"
124
+ )
125
+ analyze_parser.add_argument(
126
+ "--quiet",
127
+ action="store_true",
128
+ help="Suppress all output except errors"
129
+ )
130
+
131
+ return parser
132
+
133
+
134
+ def read_query_file(file_path: str) -> str:
135
+ """Read query from a file."""
136
+ try:
137
+ with open(file_path, 'r', encoding='utf-8') as f:
138
+ return f.read()
139
+ except FileNotFoundError:
140
+ print(f"Error: File '{file_path}' not found.")
141
+ sys.exit(1)
142
+ except Exception as e:
143
+ print(f"Error reading file '{file_path}': {e}")
144
+ sys.exit(1)
145
+
146
+
147
+
148
+
149
+
150
+ def save_output(result, output_file: str = None, pretty: bool = False):
151
+ """Save or print the result."""
152
+ # Convert AgentResult to dict if needed
153
+ if hasattr(result, 'to_dict'):
154
+ result_dict = result.to_dict()
155
+ else:
156
+ result_dict = result
157
+
158
+ if output_file:
159
+ import json
160
+ with open(output_file, 'w', encoding='utf-8') as f:
161
+ json.dump(result_dict, f, indent=2 if pretty else None)
162
+ print(f"Results saved to '{output_file}'")
163
+ else:
164
+ if pretty:
165
+ import json
166
+ print("\n" + "="*50)
167
+ print("ANALYSIS RESULTS")
168
+ print("="*50)
169
+ print(json.dumps(result_dict, indent=2))
170
+ print("="*50)
171
+ else:
172
+ print("\nResults:", result_dict)
173
+
174
+
175
+ async def run_analyze_query(args):
176
+ """Run analyze_query operation."""
177
+ logger = logging.getLogger(__name__)
178
+
179
+ # Get the query
180
+ query = args.query
181
+ if args.query_file:
182
+ query = read_query_file(args.query_file)
183
+
184
+ if not query:
185
+ logger.error("Either --query or --query-file must be specified.")
186
+ sys.exit(1)
187
+
188
+ logger.info(f"Running agent '{args.agent_name}' with query...")
189
+
190
+ try:
191
+ # Create FrameworkAgent instance
192
+ agent = FrameworkAgent(
193
+ agent_name=args.agent_name,
194
+ model_name=args.model_name,
195
+ source_code=query
196
+ )
197
+
198
+ # Run the agent
199
+ result = await agent.run_agent()
200
+
201
+ save_output(result, args.output, args.pretty)
202
+
203
+ except Exception as e:
204
+ logger.error(f"Error running agent '{args.agent_name}': {e}")
205
+ sys.exit(1)
206
+
207
+
208
+
209
+
210
+
211
+ async def main_async():
212
+ """Main CLI function."""
213
+ parser = create_parser()
214
+ args = parser.parse_args()
215
+
216
+ # Check if a command was provided
217
+ if not args.command:
218
+ parser.print_help()
219
+ sys.exit(1)
220
+
221
+ # Configure logging based on verbosity
222
+ configure_logging(verbose=args.verbose, quiet=args.quiet)
223
+
224
+ # Run the appropriate command
225
+ if args.command == 'analyze':
226
+ await run_analyze_query(args)
227
+ else:
228
+ print(f"Unknown command: {args.command}")
229
+ sys.exit(1)
230
+
231
+
232
+ def main():
233
+ """Synchronous wrapper for the async main function."""
234
+ asyncio.run(main_async())
235
+
236
+
237
+ if __name__ == "__main__":
238
+ main()
demo_server.py ADDED
@@ -0,0 +1,321 @@
1
+ import gradio as gr
2
+ import asyncio
3
+ import json
4
+ import threading
5
+ import time
6
+ import sys
7
+ import os
8
+ import logging
9
+ from typing import Optional, Dict, Any
10
+ from datetime import datetime
11
+
12
+ # Import from the published package
13
+ from lf_algorithm import FrameworkAgent
14
+ from lf_algorithm.utils import write_lineage_log
15
+
16
+ # Configure logging for the demo server
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20
+ datefmt='%Y-%m-%d %H:%M:%S'
21
+ )
22
+
23
+ class SQLLineageFrontend:
24
+ def __init__(self):
25
+ self.agent_framework = None
26
+ self.current_results = None
27
+ self.current_agent_name = None
28
+ self.log_thread = None
29
+ self.should_stop_logging = False
30
+ self.logger = logging.getLogger(__name__)
31
+
32
+ def get_visualize_link(self) -> str:
33
+ """Generate JSONCrack visualization interface for aggregation data"""
34
+ if self.current_results is None:
35
+ return """
36
+ <div style='text-align: center; padding: 20px; color: #868e96;'>
37
+ <div style='font-size: 16px; margin-bottom: 15px;'>📊 Visualization Ready</div>
38
+ <div style='font-size: 14px; margin-bottom: 20px;'>
39
+ After you run analysis and succeed, you need to got to the following link:<br>
40
+ </div>
41
+ <a href='https://jsoncrack.com/editor' target='_blank' style='color: #007bff; text-decoration: none; font-weight: bold; font-size: 16px;'>
42
+ 🔗 Open editor for simple check and paste the results there
43
+ </a>
44
+ </div>
45
+ """
46
+
47
+ try:
48
+ # Get the aggregation data - now it's directly the current_results
49
+ aggregation_data = self.current_results
50
+
51
+ # Handle different result types
52
+ if isinstance(aggregation_data, str):
53
+ try:
54
+ # Try to parse as JSON first
55
+ parsed_data = json.loads(aggregation_data)
56
+ data_to_encode = parsed_data
57
+ except json.JSONDecodeError:
58
+ # If it's not valid JSON, wrap it in a dict
59
+ data_to_encode = {"aggregation_output": aggregation_data}
60
+ elif hasattr(aggregation_data, 'to_dict'):
61
+ # Handle AgentResult objects
62
+ data_to_encode = aggregation_data.to_dict()
63
+ elif isinstance(aggregation_data, dict):
64
+ data_to_encode = aggregation_data
65
+ else:
66
+ # Fallback for other object types
67
+ data_to_encode = {"aggregation_output": str(aggregation_data)}
68
+
69
+ # Format JSON for display
70
+ formatted_json = json.dumps(data_to_encode, indent=2)
71
+
72
+ return f"""
73
+ <div style='text-align: center; padding: 10px;'>
74
+ <div style='color: #28a745; font-size: 16px; margin-bottom: 15px; font-weight: bold;'>
75
+ ✅ Analysis Complete! Ready for Visualization
76
+ </div>
77
+ <div style='color: #007bff; font-size: 14px; margin-bottom: 20px;'>
78
+ 📋 Steps to visualize your results:<br>
79
+ 1. Click "Open JSONCrack Editor" below<br>
80
+ 2. Click "Copy JSON" button or click the JSON data below to select all<br>
81
+ 3. Paste it into the JSONCrack editor
82
+ </div>
83
+ <a href='https://jsoncrack.com/editor' target='_blank' style='color: #007bff; text-decoration: none; font-weight: bold; font-size: 16px; padding: 10px 20px; border: 2px solid #007bff; border-radius: 5px; display: inline-block; margin-bottom: 15px;'>
84
+ 🔗 Open JSONCrack Editor
85
+ </a>
86
+ <br><br>
87
+ <div style='background: #f8f9fa; border: 1px solid #e0e0e0; border-radius: 5px; padding: 15px; margin: 10px 0;'>
88
+ <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'>
89
+ <div style='font-weight: bold; color: #333;'>📄 Analysis Results (JSON)</div>
90
+ <button onclick="document.getElementById('json-textarea').select(); document.getElementById('json-textarea').setSelectionRange(0, 99999); navigator.clipboard.writeText(document.getElementById('json-textarea').value).then(() => alert('JSON copied to clipboard!')).catch(() => alert('Failed to copy. Please select and copy manually.'));" style='background: #28a745; color: white; border: none; padding: 8px 16px; border-radius: 4px; cursor: pointer; font-weight: bold; width: 120px;'>📋 Copy JSON</button>
91
+ </div>
92
+ <textarea id="json-textarea" readonly style='background: #ffffff; color: #000000; padding: 12px; border-radius: 3px; border: 1px solid #e0e0e0; font-family: monospace; font-size: 12px; width: 100%; height: 250px; resize: vertical; cursor: text;' onclick="this.select(); this.setSelectionRange(0, 99999);" title="Click to select all JSON">{formatted_json}</textarea>
93
+ </div>
94
+ </div>
95
+ """
96
+ except Exception as e:
97
+ return f"<div style='color: #ff6b6b;'>❌ Error generating visualization data: {str(e)}</div>"
98
+
99
+ def get_logs_html(self) -> str:
100
+ """Generate HTML for live logs display"""
101
+ if self.current_agent_name is None:
102
+ return "<div style='color: #868e96;'>No agent initialized yet</div>"
103
+
104
+ return f"""<div style='background: #f8f9fa; border: 1px solid #e0e0e0; border-radius: 5px; padding: 15px;'>
105
+ <div style='color: #28a745; font-weight: bold; margin-bottom: 10px;'>
106
+ 📝 Logging Status for Agent: {self.current_agent_name}
107
+ </div>
108
+ <div style='color: #6c757d; font-size: 14px; line-height: 1.5;'>
109
+ ✅ <strong>Standard Python Logging Active</strong><br>
110
+ • All logs are being captured by the application's logging system<br>
111
+ • Check your console/terminal for real-time log output<br>
112
+ • Logs include detailed information about agent execution<br>
113
+ • Structured logging with timestamps and log levels<br><br>
114
+
115
+ 📋 <strong>Log Types Available:</strong><br>
116
+ • <span style='color: #007bff;'>INFO</span> - General information and progress<br>
117
+ • <span style='color: #28a745;'>DEBUG</span> - Detailed debugging information<br>
118
+ • <span style='color: #ffc107;'>WARNING</span> - Warning messages<br>
119
+ • <span style='color: #dc3545;'>ERROR</span> - Error messages<br><br>
120
+
121
+ 🔍 <strong>What You'll See:</strong><br>
122
+ • Agent initialization and configuration<br>
123
+ • MCP tool interactions and responses<br>
124
+ • Analysis progress and completion status<br>
125
+ • Any errors or warnings during execution
126
+ </div>
127
+ </div>"""
128
+
129
+ def test_log_writing(self):
130
+ """Test function to write a sample log entry"""
131
+ if self.current_agent_name:
132
+ try:
133
+ write_lineage_log(self.current_agent_name, "test", "Test log entry from frontend")
134
+ self.logger.info(f"Test log written successfully for agent: {self.current_agent_name}")
135
+ return f"✅ Test log written successfully for agent: {self.current_agent_name}! Check your console output."
136
+ except Exception as e:
137
+ self.logger.error(f"Failed to write test log: {e}")
138
+ return f"❌ Failed to write test log: {e}"
139
+ else:
140
+ return "⚠️ Please initialize an agent first by running an analysis"
141
+
142
+ def get_results_info(self) -> str:
143
+ """Get information about the current results"""
144
+ if self.current_results is None:
145
+ return "No results available yet"
146
+
147
+ if isinstance(self.current_results, dict) and "error" in self.current_results:
148
+ return f"Error in results: {self.current_results['error']}"
149
+
150
+ if hasattr(self.current_results, 'to_dict'):
151
+ # AgentResult object
152
+ result_dict = self.current_results.to_dict()
153
+ inputs_count = len(result_dict.get('inputs', []))
154
+ outputs_count = len(result_dict.get('outputs', []))
155
+ return f"✅ Structured results with {inputs_count} input(s) and {outputs_count} output(s)"
156
+
157
+ if isinstance(self.current_results, dict):
158
+ return f"✅ Dictionary results with {len(self.current_results)} keys"
159
+
160
+ return f"✅ Results type: {type(self.current_results).__name__}"
161
+
162
+ async def run_analysis(self, agent_name: str, model_name: str, query: str):
163
+ """Run SQL lineage analysis"""
164
+ try:
165
+ # Validate input
166
+ if not query or not query.strip():
167
+ return "❌ Error: Query cannot be empty. Please provide a valid query for analysis."
168
+
169
+ self.logger.info(f"Starting analysis with agent: {agent_name}, model: {model_name}")
170
+
171
+ # Initialize the agent framework with simplified constructor
172
+ self.agent_framework = FrameworkAgent(
173
+ agent_name=agent_name,
174
+ model_name=model_name,
175
+ source_code=query.strip()
176
+ )
177
+ self.current_agent_name = agent_name
178
+
179
+ self.logger.info("Agent framework initialized. Running analysis...")
180
+
181
+ # Run the analysis using the structured results method
182
+ results = await self.agent_framework.run_agent()
183
+ self.current_results = results
184
+
185
+ # Check if we got an error response
186
+ if isinstance(results, dict) and "error" in results:
187
+ self.logger.error(f"Analysis failed: {results['error']}")
188
+ return f"❌ Analysis failed: {results['error']}"
189
+
190
+ self.logger.info(f"Analysis completed successfully for agent: {agent_name}")
191
+
192
+ return f"""✅ Analysis completed successfully! Results are now available in the visualization section.
193
+ Click 'Open JSONCrack Editor' to visualize your data lineage.
194
+
195
+ If you want to set up your own local development environment or deploy this in production,
196
+ please refer to the GitHub repository mentioned above."""
197
+
198
+ except ValueError as ve:
199
+ self.logger.error(f"Validation error: {ve}")
200
+ return f"❌ Validation error: {str(ve)}"
201
+ except Exception as e:
202
+ self.logger.error(f"Error running analysis: {e}")
203
+ return f"❌ Error running analysis: {str(e)}"
204
+
205
+ def run_analysis_sync(self, agent_name: str, model_name: str, query: str):
206
+ """Synchronous wrapper for run_analysis"""
207
+ return asyncio.run(self.run_analysis(agent_name, model_name, query))
208
+
209
+ def create_ui(self):
210
+ """Create the Gradio interface"""
211
+ with gr.Blocks(title="SQL Lineage Analysis", fill_width=True) as ui:
212
+
213
+ gr.Markdown('<div style="text-align: center;font-size:24px">🔍 Demo Lineagentic-Flow</div>')
214
+ gr.Markdown('<div style="text-align: center;font-size:14px">Analyze data lineage with AI-powered agents</div>')
215
+ gr.Markdown('<div style="text-align: center;font-size:14px">Check out agent types for supporting script types</div>')
216
+ gr.Markdown('<div style="text-align: center;font-size:14px">For local and production runs, check out the repo: <a href="https://github.com/lineagentic/lineagentic-flow" target="_blank" style="color: #007bff; text-decoration: none; font-weight: bold;">🔗 https://github.com/lineagentic/lineagentic-flow</a></div>')
217
+
218
+ with gr.Row():
219
+ # Left column - Configuration and Query
220
+ with gr.Column(scale=1):
221
+ gr.Markdown("### 1. Agent Configuration")
222
+ agent_dropdown = gr.Dropdown(
223
+ label="Agent Type",
224
+ choices=[
225
+ "sql-lineage-agent",
226
+ "python-lineage-agent",
227
+ "airflow-lineage-agent",
228
+ "java-lineage-agent",
229
+ "spark-lineage-agent"
230
+ ],
231
+ value="sql-lineage-agent"
232
+ )
233
+ model_dropdown = gr.Dropdown(
234
+ label="Model",
235
+ choices=[
236
+ "gpt-4o-mini",
237
+ "gpt-4o",
238
+ "deepseek-coder",
239
+ "deepseek-chat",
240
+ "gemini-pro"
241
+ ],
242
+ value="gpt-4o-mini"
243
+ )
244
+
245
+ gr.Markdown("### 2. Query for Lineage Analysis")
246
+ query_input = gr.Textbox(
247
+ label="Query",
248
+ placeholder="Enter your SQL query here...",
249
+ lines=9,
250
+ max_lines=15
251
+ )
252
+
253
+ analyze_button = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
254
+ status_output = gr.Textbox(label="Status", interactive=False)
255
+
256
+ # Right column - Visualization and Logs
257
+ with gr.Column(scale=1):
258
+ gr.Markdown("### 3. Results Information")
259
+ results_info = gr.Textbox(
260
+ label="Results Status",
261
+ value=self.get_results_info(),
262
+ interactive=False
263
+ )
264
+
265
+ gr.Markdown("### 4. Visualize Results")
266
+ gr.Markdown("📊 After successful analysis, visualize your results in demo editor")
267
+ visualize_html = gr.HTML(self.get_visualize_link())
268
+
269
+ gr.Markdown("### 5. Logging Information")
270
+ logs_html = gr.HTML(self.get_logs_html())
271
+ test_log_button = gr.Button("Test Log Writing", variant="secondary", size="sm")
272
+
273
+ # Auto-refresh logs every 5 seconds
274
+ refresh_logs = gr.Button("🔄 Refresh Logs", variant="secondary", size="sm")
275
+ refresh_results = gr.Button("🔄 Refresh Results Info", variant="secondary", size="sm")
276
+
277
+ # Event handlers
278
+ def run_analysis_and_update(agent_name, model_name, query):
279
+ """Run analysis and update visualization"""
280
+ # Run the analysis
281
+ status_result = self.run_analysis_sync(agent_name, model_name, query)
282
+ # Update visualization, logs, and results info
283
+ viz_html = self.get_visualize_link()
284
+ logs_html = self.get_logs_html()
285
+ results_info = self.get_results_info()
286
+ return status_result, results_info, viz_html, logs_html
287
+
288
+ analyze_button.click(
289
+ fn=run_analysis_and_update,
290
+ inputs=[agent_dropdown, model_dropdown, query_input],
291
+ outputs=[status_output, results_info, visualize_html, logs_html]
292
+ )
293
+
294
+ test_log_button.click(
295
+ fn=self.test_log_writing,
296
+ inputs=[],
297
+ outputs=[status_output]
298
+ )
299
+
300
+ refresh_logs.click(
301
+ fn=self.get_logs_html,
302
+ inputs=[],
303
+ outputs=[logs_html]
304
+ )
305
+
306
+ refresh_results.click(
307
+ fn=self.get_results_info,
308
+ inputs=[],
309
+ outputs=[results_info]
310
+ )
311
+
312
+ return ui
313
+
314
+ def run(self):
315
+ """Launch the Gradio interface"""
316
+ ui = self.create_ui()
317
+ ui.launch(share=False, inbrowser=True)
318
+
319
+ if __name__ == "__main__":
320
+ frontend = SQLLineageFrontend()
321
+ frontend.run()
deploy_setup.py ADDED
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Deployment setup script for Hugging Face Spaces
4
+ This script installs the local package after all files are copied
5
+ """
6
+
7
+ import subprocess
8
+ import sys
9
+ import os
10
+
11
+ def install_local_package():
12
+ """Install the local package in editable mode"""
13
+ try:
14
+ print("📦 Installing local lineagentic-flow package...")
15
+
16
+ # First, try to install in editable mode
17
+ result = subprocess.run([
18
+ sys.executable, "-m", "pip", "install", "-e", "."
19
+ ], capture_output=True, text=True, cwd=os.getcwd())
20
+
21
+ if result.returncode == 0:
22
+ print("✅ Local package installed successfully!")
23
+
24
+ # Verify that entry points are registered
25
+ try:
26
+ import importlib.metadata
27
+ entry_points = list(importlib.metadata.entry_points(group='lineagentic.lf_algorithm.plugins'))
28
+ print(f"✅ Found {len(entry_points)} registered plugins:")
29
+ for ep in entry_points:
30
+ print(f" - {ep.name}")
31
+ return True
32
+ except Exception as e:
33
+ print(f"⚠️ Warning: Could not verify entry points: {e}")
34
+ return True
35
+ else:
36
+ print(f"❌ Failed to install local package: {result.stderr}")
37
+ return False
38
+ except Exception as e:
39
+ print(f"❌ Error installing local package: {e}")
40
+ return False
41
+
42
+ if __name__ == "__main__":
43
+ install_local_package()
lf_algorithm/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ # lf_algorithm/__init__.py
2
+ import logging
3
+
4
+ # Add NullHandler to prevent "No handler could be found" warnings
5
+ # This is the only logging configuration the library should do
6
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
7
+
8
+ from .framework_agent import FrameworkAgent
9
+ from .utils import write_lineage_log
10
+ from .utils.file_utils import dump_json_record, read_json_records, clear_json_file, get_file_stats
11
+ from .utils.tracers import LogTracer, log_trace_id
12
+ from .models.models import AgentResult
13
+ from .plugins.sql_lineage_agent.lineage_agent import SqlLineageAgent, create_sql_lineage_agent, get_plugin_info as get_sql_plugin_info
14
+ from .plugins.python_lineage_agent.lineage_agent import PythonLineageAgent, create_python_lineage_agent, get_plugin_info as get_python_plugin_info
15
+ from .plugins.airflow_lineage_agent.lineage_agent import AirflowLineageAgent, create_airflow_lineage_agent, get_plugin_info as get_airflow_plugin_info
16
+ from .plugins.java_lineage_agent.lineage_agent import JavaLineageAgent, create_java_lineage_agent, get_plugin_info as get_java_plugin_info
17
+ from .plugins.spark_lineage_agent.lineage_agent import SparkLineageAgent, create_spark_lineage_agent, get_plugin_info as get_spark_plugin_info
18
+
19
+ __version__ = "0.1.0"
20
+
21
+ __all__ = [
22
+ 'FrameworkAgent',
23
+ 'AgentResult',
24
+ 'write_lineage_log',
25
+ 'dump_json_record',
26
+ 'read_json_records',
27
+ 'clear_json_file',
28
+ 'get_file_stats',
29
+ 'LogTracer',
30
+ 'log_trace_id',
31
+ 'SqlLineageAgent',
32
+ 'create_sql_lineage_agent',
33
+ 'get_sql_plugin_info',
34
+ 'PythonLineageAgent',
35
+ 'create_python_lineage_agent',
36
+ 'get_python_plugin_info',
37
+ 'AirflowLineageAgent',
38
+ 'create_airflow_lineage_agent',
39
+ 'get_airflow_plugin_info',
40
+ 'JavaLineageAgent',
41
+ 'create_java_lineage_agent',
42
+ 'get_java_plugin_info',
43
+ 'SparkLineageAgent',
44
+ 'create_spark_lineage_agent',
45
+ 'get_spark_plugin_info'
46
+ ]
lf_algorithm/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.75 kB).
 
lf_algorithm/__pycache__/agent_manager.cpython-313.pyc ADDED
Binary file (4.56 kB). View file
 
lf_algorithm/__pycache__/framework_agent.cpython-313.pyc ADDED
Binary file (5.79 kB). View file
 
lf_algorithm/agent_manager.py ADDED
@@ -0,0 +1,84 @@
+ import importlib.metadata
+ from typing import Dict, Any, Optional, Type, Callable
+
+ from .utils import get_logger, get_model, validate_api_keys
+
+ logger = get_logger(__name__)
+
+
+ class AgentManager:
+     """Manages plugin discovery and loading for the FrameworkAgent"""
+
+     def __init__(self):
+         self.agents: Dict[str, Dict[str, Any]] = {}
+         self.agent_factories: Dict[str, Callable] = {}
+         self._load_plugins()
+         # Validate API keys on initialization
+         validate_api_keys()
+
+     def _load_plugins(self):
+         """Load all available agent plugins using entry points"""
+         try:
+             # Load plugins from the 'lineagentic.lf_algorithm.plugins' entry point group
+             for entry_point in importlib.metadata.entry_points(group='lineagentic.lf_algorithm.plugins'):
+                 try:
+                     agent_info = entry_point.load()
+                     if callable(agent_info):
+                         # If it's a function, assume it returns plugin info
+                         agent_data = agent_info()
+                     else:
+                         # If it's already a dict/object
+                         agent_data = agent_info
+
+                     agent_name = agent_data.get('name', entry_point.name)
+                     self.agents[agent_name] = agent_data
+
+                     # Store the factory function if available
+                     if 'factory_function' in agent_data:
+                         self.agent_factories[agent_name] = agent_data['factory_function']
+
+                     logger.info(f"Loaded plugin: {agent_name}")
+
+                 except Exception as e:
+                     logger.error(f"Failed to load plugin {entry_point.name}: {e}")
+
+         except Exception as e:
+             logger.error(f"Error loading plugins: {e}")
+
+     def get_agent(self, agent_name: str) -> Optional[Dict[str, Any]]:
+         """Get agent information by name"""
+         return self.agents.get(agent_name)
+
+     def list_agents(self) -> Dict[str, Dict[str, Any]]:
+         """List all available agents"""
+         return self.agents.copy()
+
+     def create_agent(self, agent_name: str, **kwargs) -> Any:
+         """Create an agent instance using the agent's factory function"""
+         if agent_name not in self.agent_factories:
+             raise ValueError(f"Agent '{agent_name}' not found or has no factory function")
+
+         factory = self.agent_factories[agent_name]
+         # Pass the get_model function to the agent factory
+         kwargs['get_model_func'] = get_model
+         return factory(agent_name=agent_name, **kwargs)
+
+     def get_supported_operations(self) -> Dict[str, list]:
+         """Get all supported operations from all agents"""
+         operations = {}
+         for agent_name, agent_info in self.agents.items():
+             supported_ops = agent_info.get('supported_operations', [])
+             for op in supported_ops:
+                 if op not in operations:
+                     operations[op] = []
+                 operations[op].append(agent_name)
+         return operations
+
+     def get_agents_for_operation(self, operation: str) -> list:
+         """Get all agents that support a specific operation"""
+         supported_ops = self.get_supported_operations()
+         return supported_ops.get(operation, [])
+
+
+ # Global agent manager instance
+ agent_manager = AgentManager()
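A short discovery sketch against the global manager (assumes at least one plugin is installed and that `validate_api_keys()` passes in your environment):

```python
from lf_algorithm.agent_manager import agent_manager

# Enumerate whatever plugins the entry-point scan found.
for name, info in agent_manager.list_agents().items():
    print(name, "->", info.get("description", "no description"))

# Reverse index: which agents support a given operation?
print(agent_manager.get_agents_for_operation("lineage_analysis"))
```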
lf_algorithm/framework_agent.py ADDED
@@ -0,0 +1,130 @@
+ import asyncio
+ import sys
+ import os
+ from typing import Dict, Any, List, Optional, Union
+ import json
+ from datetime import datetime
+ import uuid
+
+ from .utils import get_logger, get_model, validate_api_keys
+
+ logger = get_logger(__name__)
+
+ from .utils.tracers import LogTracer
+ from .agent_manager import agent_manager
+ from agents import add_trace_processor
+ from .models.models import AgentResult
+
+
+ class FrameworkAgent:
+
+     def __init__(self, agent_name: str, model_name: str = "gpt-4o-mini",
+                  source_code: str = None):
+         """
+         Initialize the Agent Framework.
+
+         Args:
+             agent_name (str): The name of the agent to use
+             model_name (str): The model to use for the agents (default: "gpt-4o-mini")
+             source_code (str): The source code to analyze
+
+         Raises:
+             ValueError: If source_code is not provided
+         """
+         if not source_code:
+             raise ValueError("source_code is required and cannot be None")
+
+         self.agent_name = agent_name
+         self.model_name = model_name
+         self.source_code = source_code
+         self.agent_manager = agent_manager
+
+         # Validate API keys on initialization
+         validate_api_keys()
+
+         logger.info(f"FrameworkAgent initialized: agent_name={agent_name}, model_name={model_name}")
+
+     async def run_agent_plugin(self, **kwargs) -> Dict[str, Any]:
+         """
+         Run a specific agent on the configured source code.
+
+         Args:
+             **kwargs: Additional arguments to pass to the agent
+
+         Returns:
+             Dict[str, Any]: The results from the agent with merged OpenLineage metadata
+         """
+         logger.info(f"Starting agent: {self.agent_name} with model: {self.model_name}")
+         add_trace_processor(LogTracer())
+
+         try:
+             # Create the agent using the plugin's factory function
+             logger.info(f"Creating agent instance for: {self.agent_name}")
+             agent = self.agent_manager.create_agent(
+                 agent_name=self.agent_name,
+                 source_code=self.source_code,
+                 model_name=self.model_name,
+                 **kwargs
+             )
+
+             # Run the agent
+             logger.info(f"Running agent: {self.agent_name}")
+             results = await agent.run()
+             logger.info(f"Agent {self.agent_name} completed successfully")
+
+             return results
+
+         except Exception as e:
+             logger.error(f"Error running agent {self.agent_name}: {e}")
+             return {"error": str(e)}
+
+     def map_results_to_objects(self, results: Dict[str, Any]) -> Union[AgentResult, Dict[str, Any]]:
+         """
+         Map JSON results from agent to structured AgentResult objects.
+
+         Args:
+             results: Dictionary containing the agent results
+
+         Returns:
+             AgentResult: Structured object representation of the results, or original dict if mapping fails
+         """
+         try:
+             # Check if it's an error response
+             if "error" in results:
+                 return results
+
+             # Check if it has the expected structure for lineage results
+             if "inputs" in results and "outputs" in results:
+                 return AgentResult.from_dict(results)
+
+             # If it doesn't match the expected structure, return as-is
+             return results
+
+         except Exception as e:
+             logger.error(f"Error mapping results to objects: {e}")
+             return results
+
+     async def run_agent(self, **kwargs) -> Union[AgentResult, Dict[str, Any]]:
+         """
+         Run a specific agent and return structured objects instead of raw dictionaries.
+
+         Args:
+             **kwargs: Additional arguments to pass to the agent
+
+         Returns:
+             Union[AgentResult, Dict[str, Any]]: Structured AgentResult object or error dict
+         """
+         logger.info(f"Starting run_agent for {self.agent_name}")
+         raw_results = await self.run_agent_plugin(**kwargs)
+         mapped_results = self.map_results_to_objects(raw_results)
+         logger.info(f"Agent {self.agent_name} completed. Results type: {type(mapped_results)}")
+         if hasattr(mapped_results, 'to_dict'):
+             logger.info(f"Mapped results: {mapped_results.to_dict()}")
+         else:
+             logger.info(f"Raw results: {mapped_results}")
+         return mapped_results
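The mapping step can be exercised without any LLM call, since it only inspects dict shape; a sketch (again assuming API-key validation in `__init__` passes in your environment):

```python
from lf_algorithm import AgentResult, FrameworkAgent

fa = FrameworkAgent(agent_name="sql-lineage-agent", source_code="SELECT 1")

# A dict with "inputs"/"outputs" maps to a structured AgentResult...
payload = {"inputs": [{"namespace": "default", "name": "users"}], "outputs": []}
assert isinstance(fa.map_results_to_objects(payload), AgentResult)

# ...while error dicts and unrecognized shapes pass through unchanged.
assert fa.map_results_to_objects({"error": "boom"}) == {"error": "boom"}
```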
lf_algorithm/models/__pycache__/models.cpython-313.pyc ADDED
Binary file (16.1 kB).
lf_algorithm/models/models.py ADDED
@@ -0,0 +1,285 @@
+ """
+ Agent result models for mapping JSON responses from lineage agents.
+
+ This module contains classes for representing the structured results returned
+ by lineage analysis agents in a type-safe manner.
+ """
+
+ from typing import Dict, Any, List, Optional
+
+
+ class SchemaField:
+     """Schema field configuration for agent results"""
+
+     def __init__(self, name: str, type: str, description: str):
+         self.name = name
+         self.type = type
+         self.description = description
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'SchemaField':
+         """Create SchemaField from dictionary"""
+         return cls(
+             name=data.get('name', ''),
+             type=data.get('type', ''),
+             description=data.get('description', '')
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'name': self.name,
+             'type': self.type,
+             'description': self.description
+         }
+
+
+ class Schema:
+     """Schema configuration for agent results"""
+
+     def __init__(self, fields: List[SchemaField]):
+         self.fields = fields
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Schema':
+         """Create Schema from dictionary"""
+         fields = [SchemaField.from_dict(field) for field in data.get('fields', [])]
+         return cls(fields=fields)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'fields': [field.to_dict() for field in self.fields]
+         }
+
+
+ class Transformation:
+     """Transformation configuration for column lineage"""
+
+     def __init__(self, type: str, subtype: str, description: str, masking: bool = False):
+         self.type = type
+         self.subtype = subtype
+         self.description = description
+         self.masking = masking
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Transformation':
+         """Create Transformation from dictionary"""
+         return cls(
+             type=data.get('type', ''),
+             subtype=data.get('subtype', ''),
+             description=data.get('description', ''),
+             masking=data.get('masking', False)
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'type': self.type,
+             'subtype': self.subtype,
+             'description': self.description,
+             'masking': self.masking
+         }
+
+
+ class InputField:
+     """Input field configuration for column lineage"""
+
+     def __init__(self, namespace: str, name: str, field: str,
+                  transformations: List[Transformation]):
+         self.namespace = namespace
+         self.name = name
+         self.field = field
+         self.transformations = transformations
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'InputField':
+         """Create InputField from dictionary"""
+         transformations = [Transformation.from_dict(t) for t in data.get('transformations', [])]
+         return cls(
+             namespace=data.get('namespace', ''),
+             name=data.get('name', ''),
+             field=data.get('field', ''),
+             transformations=transformations
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'namespace': self.namespace,
+             'name': self.name,
+             'field': self.field,
+             'transformations': [t.to_dict() for t in self.transformations]
+         }
+
+
+ class ColumnLineageField:
+     """Column lineage field configuration"""
+
+     def __init__(self, input_fields: List[InputField]):
+         self.input_fields = input_fields
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'ColumnLineageField':
+         """Create ColumnLineageField from dictionary"""
+         input_fields = [InputField.from_dict(field) for field in data.get('inputFields', [])]
+         return cls(input_fields=input_fields)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'inputFields': [field.to_dict() for field in self.input_fields]
+         }
+
+
+ class ColumnLineage:
+     """Column lineage configuration"""
+
+     def __init__(self, fields: Dict[str, ColumnLineageField]):
+         self.fields = fields
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'ColumnLineage':
+         """Create ColumnLineage from dictionary"""
+         fields = {
+             field_name: ColumnLineageField.from_dict(field_data)
+             for field_name, field_data in data.get('fields', {}).items()
+         }
+         return cls(fields=fields)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'fields': {
+                 field_name: field_data.to_dict()
+                 for field_name, field_data in self.fields.items()
+             }
+         }
+
+
+ class InputFacets:
+     """Input facets configuration for agent results"""
+
+     def __init__(self, schema: Optional[Schema] = None):
+         self.schema = schema
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'InputFacets':
+         """Create InputFacets from dictionary"""
+         schema = Schema.from_dict(data.get('schema', {})) if data.get('schema') else None
+         return cls(schema=schema)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         result = {}
+         if self.schema:
+             result['schema'] = self.schema.to_dict()
+         return result
+
+
+ class Input:
+     """Input configuration for agent results"""
+
+     def __init__(self, namespace: str, name: str, facets: Optional[InputFacets] = None):
+         self.namespace = namespace
+         self.name = name
+         self.facets = facets
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Input':
+         """Create Input from dictionary"""
+         facets = InputFacets.from_dict(data.get('facets', {})) if data.get('facets') else None
+         return cls(
+             namespace=data.get('namespace', ''),
+             name=data.get('name', ''),
+             facets=facets
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         result = {
+             'namespace': self.namespace,
+             'name': self.name
+         }
+         if self.facets:
+             result['facets'] = self.facets.to_dict()
+         return result
+
+
+ class OutputFacets:
+     """Output facets configuration for agent results"""
+
+     def __init__(self, column_lineage: Optional[ColumnLineage] = None):
+         self.column_lineage = column_lineage
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'OutputFacets':
+         """Create OutputFacets from dictionary"""
+         column_lineage = ColumnLineage.from_dict(data.get('columnLineage', {})) if data.get('columnLineage') else None
+         return cls(column_lineage=column_lineage)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         result = {}
+         if self.column_lineage:
+             result['columnLineage'] = self.column_lineage.to_dict()
+         return result
+
+
+ class Output:
+     """Output configuration for agent results"""
+
+     def __init__(self, namespace: str, name: str, facets: Optional[OutputFacets] = None):
+         self.namespace = namespace
+         self.name = name
+         self.facets = facets
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Output':
+         """Create Output from dictionary"""
+         facets = OutputFacets.from_dict(data.get('facets', {})) if data.get('facets') else None
+         return cls(
+             namespace=data.get('namespace', ''),
+             name=data.get('name', ''),
+             facets=facets
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         result = {
+             'namespace': self.namespace,
+             'name': self.name
+         }
+         if self.facets:
+             result['facets'] = self.facets.to_dict()
+         return result
+
+
+ class AgentResult:
+     """Main result class for agent lineage analysis"""
+
+     def __init__(self, inputs: List[Input], outputs: List[Output]):
+         self.inputs = inputs
+         self.outputs = outputs
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'AgentResult':
+         """Create AgentResult from dictionary"""
+         inputs = [Input.from_dict(input_data) for input_data in data.get('inputs', [])]
+         outputs = [Output.from_dict(output_data) for output_data in data.get('outputs', [])]
+         return cls(inputs=inputs, outputs=outputs)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary"""
+         return {
+             'inputs': [input_obj.to_dict() for input_obj in self.inputs],
+             'outputs': [output_obj.to_dict() for output_obj in self.outputs]
+         }
+
+     def __str__(self) -> str:
+         """String representation"""
+         return f"AgentResult(inputs={len(self.inputs)}, outputs={len(self.outputs)})"
+
+     def __repr__(self) -> str:
+         """Detailed string representation"""
+         return f"AgentResult(inputs={self.inputs}, outputs={self.outputs})"
lf_algorithm/plugins/__init__.py ADDED
@@ -0,0 +1 @@
+ # Plugin system for FrameworkAgent
lf_algorithm/plugins/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (187 Bytes).
lf_algorithm/plugins/airflow_lineage_agent/__init__.py ADDED
@@ -0,0 +1 @@
+
lf_algorithm/plugins/airflow_lineage_agent/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (209 Bytes).
lf_algorithm/plugins/airflow_lineage_agent/__pycache__/airflow_instructions.cpython-313.pyc ADDED
Binary file (5.21 kB).
lf_algorithm/plugins/airflow_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc ADDED
Binary file (5.95 kB).
lf_algorithm/plugins/airflow_lineage_agent/airflow_instructions.py ADDED
@@ -0,0 +1,98 @@
+ def comprehensive_analysis_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Perform complete Airflow DAG lineage analysis in a single comprehensive process.
+
+ **Complete Analysis Process:**
+
+ **Step 1: Syntax Analysis**
+ 1. Call the airflow_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Airflow DAG structure
+ 3. Store the syntax analysis results for use in subsequent steps
+
+ **Step 2: Field Derivation**
+ 1. Call the airflow_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis
+ 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations
+ 4. Store the field derivation results
+
+ **Step 3: Operation Tracing**
+ 1. Call the airflow_lineage_operation_tracing() MCP tool to get expert instructions
+ 2. Use the syntax analysis results from Step 1 to inform your operation analysis
+ 3. Follow the MCP tool instructions exactly to analyze logical operations and operators
+ 4. Store the operation tracing results
+
+ **Step 4: Event Composition**
+ 1. Call the airflow_lineage_event_composer() MCP tool to get expert instructions
+ 2. Combine all previous analysis results (syntax, field derivation, operation tracing)
+ 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event
+ 4. Return the complete OpenLineage event
+
+ **Important Guidelines:**
+ - Each MCP tool contains detailed instructions, examples, and output format requirements
+ - Follow the MCP tool instructions precisely for each step
+ - Maintain context between steps - use results from earlier steps to inform later analysis
+ - Ensure the final output is a complete, properly formatted OpenLineage event
+ - If any step fails, provide clear error information and stop the process
+
+ **Workflow Summary:**
+ Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output
+ """
+
+ # Keep the individual instructions for backward compatibility if needed
+ def syntax_analysis_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Analyze the provided Airflow DAG for syntax structure.
+
+ **Process:**
+ 1. Call the airflow_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Airflow DAG
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def field_derivation_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Analyze field mappings and transformations in the Airflow DAG.
+
+ **Process:**
+ 1. Call the airflow_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze field mappings
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def operation_tracing_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Analyze logical operations and operators in the Airflow DAG.
+
+ **Process:**
+ 1. Call the airflow_lineage_operation_tracing() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze logical operations
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def event_composer_instructions(name: str):
+     return f"""
+ You are the {name} Airflow lineage analysis agent.
+
+ **Your Task:** Compose OpenLineage events from the provided analysis data.
+
+ **Process:**
+ 1. Call the airflow_lineage_event_composer() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to compose the OpenLineage event
+ 3. Return the event in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
lf_algorithm/plugins/airflow_lineage_agent/lineage_agent.py ADDED
@@ -0,0 +1,98 @@
+ import os
+ import sys
+ import logging
+ from contextlib import AsyncExitStack
+ from agents import Agent, Tool, Runner, trace
+ from agents.mcp.server import MCPServerStdio
+ from typing import Dict, Any, Optional
+
+ from ...utils.tracers import log_trace_id
+ from ...plugins.airflow_lineage_agent.airflow_instructions import comprehensive_analysis_instructions
+ from ...plugins.airflow_lineage_agent.mcp_servers.mcp_params import airflow_mcp_server_params
+ from ...utils.file_utils import dump_json_record
+
+ # Get logger for this module
+ logger = logging.getLogger(__name__)
+
+ MAX_TURNS = 30  # Increased for comprehensive analysis
+
+
+ class AirflowLineageAgent:
+     """Plugin agent for Airflow lineage analysis"""
+
+     def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None):
+         self.agent_name = agent_name
+         self.model_name = model_name
+         self.source_code = source_code
+         self.get_model_func = get_model_func
+
+     async def create_agent(self, airflow_mcp_servers) -> Agent:
+         # Use the passed get_model_func or fall back to the centralized one
+         if self.get_model_func:
+             model = self.get_model_func(self.model_name)
+         else:
+             from ...utils import get_model
+             model = get_model(self.model_name)
+
+         agent = Agent(
+             name=self.agent_name,
+             instructions=comprehensive_analysis_instructions(self.agent_name),
+             model=model,
+             mcp_servers=airflow_mcp_servers,
+         )
+         return agent
+
+     async def run_agent(self, airflow_mcp_servers, source_code: str):
+         # Create a single agent for comprehensive analysis
+         comprehensive_agent = await self.create_agent(airflow_mcp_servers)
+
+         # Run the complete analysis in one go
+         result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS)
+
+         # Return the final output
+         return dump_json_record(self.agent_name, result.final_output)
+
+     async def run_with_mcp_servers(self, source_code: str):
+         async with AsyncExitStack() as stack:
+             airflow_mcp_servers = [
+                 await stack.enter_async_context(
+                     MCPServerStdio(params, client_session_timeout_seconds=120)
+                 )
+                 for params in airflow_mcp_server_params
+             ]
+             return await self.run_agent(airflow_mcp_servers, source_code=source_code)
+
+     async def run_with_trace(self, source_code: str):
+         trace_name = f"{self.agent_name}-lineage-agent"
+         trace_id = log_trace_id(f"{self.agent_name.lower()}")
+         with trace(trace_name, trace_id=trace_id):
+             return await self.run_with_mcp_servers(source_code=source_code)
+
+     async def run(self):
+         try:
+             logger.info(f"Starting Airflow lineage analysis for {self.agent_name}")
+             result = await self.run_with_trace(self.source_code)
+             logger.info(f"Completed Airflow lineage analysis for {self.agent_name}")
+             return result
+         except Exception as e:
+             logger.error(f"Error running {self.agent_name}: {e}")
+             return {"error": str(e)}
+
+
+ # Plugin interface functions
+ def create_airflow_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> AirflowLineageAgent:
+     """Factory function to create an AirflowLineageAgent instance"""
+     return AirflowLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func)
+
+
+ def get_plugin_info() -> Dict[str, Any]:
+     """Return plugin metadata"""
+     return {
+         "name": "airflow-lineage-agent",
+         "description": "Airflow lineage analysis agent for parsing and analyzing Airflow DAGs",
+         "version": "1.0.0",
+         "author": "Ali Shamsaddinlou",
+         "agent_class": AirflowLineageAgent,
+         "factory_function": create_airflow_lineage_agent,
+         "supported_operations": ["lineage_analysis"],
+     }
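A direct-usage sketch of this plugin, bypassing FrameworkAgent. It assumes the relative MCP server path in mcp_params resolves from your working directory, that an LLM API key is configured, and a hypothetical DAG file path:

```python
import asyncio

from lf_algorithm.plugins.airflow_lineage_agent.lineage_agent import (
    create_airflow_lineage_agent,
)

with open("dags/customer_etl.py") as f:  # hypothetical DAG file
    dag_source = f.read()

agent = create_airflow_lineage_agent(
    agent_name="airflow-lineage-agent",
    source_code=dag_source,
)
print(asyncio.run(agent.run()))  # OpenLineage event, or {"error": ...}
```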
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__init__.py ADDED
File without changes
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (221 Bytes).
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc ADDED
Binary file (515 Bytes).
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/__init__.py ADDED
File without changes
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py ADDED
@@ -0,0 +1,55 @@
+ import logging
+
+ # Configure logging to suppress verbose output
+ logging.basicConfig(level=logging.WARNING)
+ logging.getLogger('mcp').setLevel(logging.WARNING)
+ logging.getLogger('mcp.server').setLevel(logging.WARNING)
+
+ from mcp.server.fastmcp import FastMCP
+ from typing import Dict, Any
+
+ mcp = FastMCP("lineage_airflow_server")
+
+ from templates import (airflow_lineage_syntax_analysis as syntax_analysis_template,
+                        airflow_lineage_field_derivation as field_derivation_template,
+                        airflow_lineage_operation_tracing as operation_tracing_template,
+                        airflow_lineage_event_composer as event_composer_template)
+
+ @mcp.tool()
+ async def airflow_lineage_syntax_analysis() -> Dict[str, Any]:
+     """Airflow lineage structure and syntax decomposition expert"""
+     return {
+         "instructions": syntax_analysis_template(),
+         "version": "1.0.0",
+         "capabilities": ["dag_parsing", "task_extraction", "dependency_analysis"]
+     }
+
+ @mcp.tool()
+ async def airflow_lineage_field_derivation() -> Dict[str, Any]:
+     """Field mapping and field derivation expert"""
+     return {
+         "instructions": field_derivation_template(),
+         "version": "1.0.0",
+         "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"]
+     }
+
+ @mcp.tool()
+ async def airflow_lineage_operation_tracing() -> Dict[str, Any]:
+     """Logical operator analysis and operation tracing expert"""
+     return {
+         "instructions": operation_tracing_template(),
+         "version": "1.0.0",
+         "capabilities": ["filter_analysis", "join_analysis", "aggregation_tracking"]
+     }
+
+ @mcp.tool()
+ async def airflow_lineage_event_composer() -> Dict[str, Any]:
+     """Event composition and aggregation expert"""
+     return {
+         "instructions": event_composer_template(),
+         "version": "1.0.0",
+         "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"]
+     }
+
+ if __name__ == "__main__":
+     mcp.run(transport='stdio')
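Since each tool just wraps a template builder, the instruction payloads can be smoke-tested without an MCP client. A sketch, run from this server's directory so the local `templates` import resolves:

```python
# Checks the raw instruction text behind the MCP tools above.
from templates import airflow_lineage_syntax_analysis

text = airflow_lineage_syntax_analysis()
assert "Airflow DAG decomposition expert" in text
print(text[:120])
```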
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/templates.py ADDED
@@ -0,0 +1,777 @@
+ from datetime import datetime
+
+
+ def airflow_lineage_syntax_analysis():
+     return """
+ You are an Airflow DAG decomposition expert. Your task is to parse an Airflow DAG Python file and extract a clean breakdown of each task as logical units, including key operators, dependencies, and parameters.
+
+ Instructions:
+ - Extract complete Airflow tasks (not individual lines).
+ - Include task_id, operator name, and any important arguments (e.g., sql, bash_command, python_callable).
+ - Identify upstream/downstream task relationships.
+ - Do NOT include imports, default_args, or DAG definitions unless they affect task behavior directly.
+ - For TaskGroups or dynamic mapping, expand each logical unit clearly.
+
+ Output Format (JSON):
+ {
+   "tasks": [
+     {
+       "task_id": "<task_id>",
+       "operator": "<OperatorName>",
+       "params": {
+         "key1": "value1",
+         ...
+       },
+       "upstream": ["<task_id_1>", "<task_id_2>"],
+       "downstream": ["<task_id_3>"]
+     },
+     ...
+   ]
+ }
+
+ ---
+
+ Positive Example 1: Basic Bash DAG
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.bash import BashOperator
+
+ with DAG('sample_dag') as dag:
+     t1 = BashOperator(task_id='start', bash_command='echo "start"')
+     t2 = BashOperator(task_id='process', bash_command='python run_job.py')
+     t3 = BashOperator(task_id='end', bash_command='echo "done"')
+     t1 >> t2 >> t3
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "start",
+       "operator": "BashOperator",
+       "params": { "bash_command": "echo \"start\"" },
+       "upstream": [],
+       "downstream": ["process"]
+     },
+     {
+       "task_id": "process",
+       "operator": "BashOperator",
+       "params": { "bash_command": "python run_job.py" },
+       "upstream": ["start"],
+       "downstream": ["end"]
+     },
+     {
+       "task_id": "end",
+       "operator": "BashOperator",
+       "params": { "bash_command": "echo \"done\"" },
+       "upstream": ["process"],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 2: PythonOperator DAG
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+
+ def fetch_data():
+     return "data"
+
+ def transform_data():
+     return "transformed"
+
+ with DAG('etl_dag') as dag:
+     extract = PythonOperator(task_id='extract', python_callable=fetch_data)
+     transform = PythonOperator(task_id='transform', python_callable=transform_data)
+     extract >> transform
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "extract",
+       "operator": "PythonOperator",
+       "params": { "python_callable": "fetch_data" },
+       "upstream": [],
+       "downstream": ["transform"]
+     },
+     {
+       "task_id": "transform",
+       "operator": "PythonOperator",
+       "params": { "python_callable": "transform_data" },
+       "upstream": ["extract"],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 3: Branching with BranchPythonOperator
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator, BranchPythonOperator
+ from airflow.operators.dummy import DummyOperator
+
+ def choose_path():
+     return "path_a"
+
+ with DAG('branch_dag') as dag:
+     start = DummyOperator(task_id='start')
+     branch = BranchPythonOperator(task_id='branch', python_callable=choose_path)
+     path_a = DummyOperator(task_id='path_a')
+     path_b = DummyOperator(task_id='path_b')
+     end = DummyOperator(task_id='end')
+
+     start >> branch >> [path_a, path_b]
+     [path_a, path_b] >> end
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "start",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": [],
+       "downstream": ["branch"]
+     },
+     {
+       "task_id": "branch",
+       "operator": "BranchPythonOperator",
+       "params": { "python_callable": "choose_path" },
+       "upstream": ["start"],
+       "downstream": ["path_a", "path_b"]
+     },
+     {
+       "task_id": "path_a",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["branch"],
+       "downstream": ["end"]
+     },
+     {
+       "task_id": "path_b",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["branch"],
+       "downstream": ["end"]
+     },
+     {
+       "task_id": "end",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["path_a", "path_b"],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 4: TaskGroup
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.dummy import DummyOperator
+ from airflow.utils.task_group import TaskGroup
+
+ with DAG('grouped_dag') as dag:
+     start = DummyOperator(task_id='start')
+     end = DummyOperator(task_id='end')
+
+     with TaskGroup('transformations') as tg:
+         t1 = DummyOperator(task_id='clean')
+         t2 = DummyOperator(task_id='enrich')
+         t1 >> t2
+
+     start >> tg >> end
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "start",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": [],
+       "downstream": ["transformations.clean"]
+     },
+     {
+       "task_id": "transformations.clean",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["start"],
+       "downstream": ["transformations.enrich"]
+     },
+     {
+       "task_id": "transformations.enrich",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["transformations.clean"],
+       "downstream": ["end"]
+     },
+     {
+       "task_id": "end",
+       "operator": "DummyOperator",
+       "params": {},
+       "upstream": ["transformations.enrich"],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 5: Dynamic Task Mapping with expand()
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+
+ def greet(name):
+     print(f"Hello {name}")
+
+ with DAG('dynamic_dag') as dag:
+     greet_task = PythonOperator.partial(
+         task_id='greet',
+         python_callable=greet
+     ).expand(op_args=[["Alice", "Bob", "Charlie"]])
+
+ Expected Output:
+ {
+   "tasks": [
+     {
+       "task_id": "greet",
+       "operator": "PythonOperator.expand",
+       "params": {
+         "python_callable": "greet",
+         "op_args": ["Alice", "Bob", "Charlie"]
+       },
+       "upstream": [],
+       "downstream": []
+     }
+   ]
+ }
+
+ ---
+
+ Negative Example 1:
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+
+ def fetch():
+     return "data"
+
+ with DAG('bad_dag') as dag:
+     task = PythonOperator(task_id='fetch', python_callable=fetch)
+
+ Incorrect Output:
+ {
+   "fetch": "PythonOperator"
+ }
+
+ Reason:
+ - The structure is invalid:
+   - It lacks the required `"tasks"` array.
+   - It omits the `"params"` block.
+   - It does not specify upstream/downstream relationships.
+ """
+
+
+ def airflow_lineage_field_derivation():
+     return """
+ You are an Airflow task field mapping analysis expert. Your task is to analyze each task in an Airflow DAG and determine:
+
+ 1. What input data or fields it depends on.
+ 2. What transformations it performs.
+ 3. What output data or fields it produces.
+
+ Instructions:
+ - Focus on operators like BashOperator, PythonOperator, SQL-related operators, etc.
+ - Do NOT analyze Airflow scheduling logic or metadata unless it affects lineage.
+ - For PythonOperators, infer logic from the function if possible.
+ - For SQL or BashOperators, parse the SQL or script if included.
+ - Your job is to extract lineage-relevant inputs, transformations, and outputs.
+ - Look into all the operators and their parameters, and infer the inputs, outputs, and transformations.
+ - If the operator is a PythonOperator, look into the function and infer the inputs, outputs, and transformations.
+ - If the operator is an SQLOperator, look into the SQL and infer the inputs, outputs, and transformations.
+ - If the operator is a BashOperator, look into the Bash command and infer the inputs, outputs, and transformations.
+ - If the operator is a PostgresOperator, look into the SQL and infer the inputs, outputs, and transformations.
+ - If the operator is a MySQLOperator, look into the SQL and infer the inputs, outputs, and transformations.
+ - If the operator is an OracleOperator, look into the SQL and infer the inputs, outputs, and transformations.
+ - If the operator is a SparkOperator, look into the Spark code and infer the inputs, outputs, and transformations.
+ - If the operator is a HiveOperator, look into the Hive code and infer the inputs, outputs, and transformations.
+ - If the operator is a KafkaOperator, look into the Kafka code and infer the inputs, outputs, and transformations.
+ - If the operator is an S3Operator, look into the S3 code and infer the inputs, outputs, and transformations.
+ - If the operator is a GCSOperator, look into the GCS code and infer the inputs, outputs, and transformations.
+ - If the operator is an FTPOperator, look into the FTP code and infer the inputs, outputs, and transformations.
+ - If the operator is an SFTPOperator, look into the SFTP code and infer the inputs, outputs, and transformations.
+
+ Output Format:
+ [
+   { "output_fields": [ {
+       "namespace": "<INPUT_NAMESPACE>",
+       "name": "<INPUT_NAME>",
+       "field": "<INPUT_FIELD_NAME>",
+       "transformation": "<description of logic>"
+   } ] },
+   ...
+ ]
+
+ Positive Example:
+
+ Input:
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+ from airflow.providers.postgres.hooks.postgres import PostgresHook
+ from datetime import datetime
+ import pandas as pd
+ import numpy as np
+ import shutil
+
+ def fetch_raw_data():
+     # Simulate a data pull or raw copy
+     shutil.copy('/data/source/raw_customers.csv', '/data/input/customers.csv')
+
+ def transform_customer_data():
+     df = pd.read_csv('/data/input/customers.csv')
+
+     df['first_name'] = df['first_name'].str.strip().str.title()
+     df['last_name'] = df['last_name'].str.strip().str.title()
+     df['full_name'] = df['first_name'] + ' ' + df['last_name']
+
+     df['birthdate'] = pd.to_datetime(df['birthdate'])
+     df['age'] = (pd.Timestamp('today') - df['birthdate']).dt.days // 365
+
+     df['age_group'] = np.where(df['age'] >= 60, 'Senior',
+                                np.where(df['age'] >= 30, 'Adult', 'Young'))
+
+     df = df[df['email'].notnull()]
+
+     df.to_csv('/data/output/cleaned_customers.csv', index=False)
+
+ def load_to_warehouse():
+     # Load cleaned data to customers_1 table in database
+     df = pd.read_csv('/data/output/cleaned_customers.csv')
+
+     # Get database connection
+     pg_hook = PostgresHook(postgres_conn_id='warehouse_connection')
+     engine = pg_hook.get_sqlalchemy_engine()
+
+     # Write to customers_1 table
+     df.to_sql('customers_1', engine, if_exists='replace', index=False)
+
+     print(f"Successfully loaded {len(df)} records to customers_1 table")
+
+ default_args = {
+     'start_date': datetime(2025, 8, 1),
+ }
+
+ with DAG(
+     dag_id='customer_etl_pipeline_extended',
+     default_args=default_args,
+     schedule_interval='@daily',
+     catchup=False,
+     tags=['etl', 'example']
+ ) as dag:
+
+     ff = PythonOperator(
+         task_id='fetch_data',
+         python_callable=fetch_raw_data
+     )
+
+     tt = PythonOperator(
+         task_id='transform_and_clean',
+         python_callable=transform_customer_data
+     )
+
+     ll = PythonOperator(
+         task_id='load_to_warehouse',
+         python_callable=load_to_warehouse
+     )
+
+     ff >> tt >> ll
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "first_name",
+       "transformation": "Strip and title case"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "last_name",
+       "transformation": "Strip and title case"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "full_name",
+       "transformation": "Concatenation with space"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "birthdate",
+       "transformation": "Convert to datetime"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "age",
+       "transformation": "Calculate age"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "age_group",
+       "transformation": "Group by age"
+     },
+     {
+       "namespace": "default",
+       "name": "customers.csv",
+       "field": "email",
+       "transformation": "Remove nulls"
+     }
+   ]
+ }
+ """
+
+
+ def airflow_lineage_operation_tracing():
+     return """
+ You are a logical operator analysis expert for Airflow DAGs. Your task is to inspect each task’s logic and extract the logical operations applied to data fields. This includes:
+
+ - Filters
+ - Joins (if any SQL is embedded or implied)
+ - Group by / Having
+ - Order by
+ - Other conditional logic (e.g., CASE, EXISTS, .apply filters)
+
+ Instructions:
+ - Only include fields involved in logic, not all fields.
+ - Tasks using Python callables or SQL should be parsed and analyzed.
+ - Bash commands are only considered if they invoke Python/SQL/CLI logic that performs data filtering or selection.
+
+ Output Format:
+ {
+   "logical_operators": [
+     {
+       "task_id": "<task_id>",
+       "source_fields": ["<field1>", "<field2>", ...],
+       "logical_operators": {
+         "filters": ["..."],
+         "joins": ["..."],
+         "group_by": ["..."],
+         "having": ["..."],
+         "order_by": ["..."],
+         "other": ["..."]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 1:
+
+ Input:
+ from airflow.operators.postgres_operator import PostgresOperator
+
+ t1 = PostgresOperator(
+     task_id='filter_active_users',
+     sql='SELECT id, name FROM users WHERE status = \'active\' ORDER BY name',
+     postgres_conn_id='analytics_db'
+ )
+
+ Expected Output:
+ {
+   "logical_operators": [
+     {
+       "task_id": "filter_active_users",
+       "source_fields": ["status", "name"],
+       "logical_operators": {
+         "filters": ["status = 'active'"],
+         "order_by": ["name"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 2:
+
+ Input:
+ from airflow.operators.python import PythonOperator
+
+ def filter_sales():
+     import pandas as pd
+     df = pd.read_csv("sales.csv")
+     filtered = df[df["region"] == "EU"]
+     result = filtered[filtered["amount"] > 1000]
+     return result
+
+ t2 = PythonOperator(
+     task_id='filter_sales',
+     python_callable=filter_sales
+ )
+
+ Expected Output:
+ {
+   "logical_operators": [
+     {
+       "task_id": "filter_sales",
+       "source_fields": ["region", "amount"],
+       "logical_operators": {
+         "filters": ["df['region'] == 'EU'", "filtered['amount'] > 1000"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Negative Example 1:
+
+ Input:
+ from airflow.operators.bash import BashOperator
+
+ t3 = BashOperator(
+     task_id='run_model',
+     bash_command='python model.py'
+ )
+
+ Incorrect Output:
+ {
+   "logical_operators": [
+     {
+       "task_id": "run_model",
+       "source_fields": ["model"],
+       "logical_operators": {
+         "filters": ["--use-gpu"]
+       }
+     }
+   ]
+ }
+
+ Reason:
+ - BashOperator with a generic script path provides no visible logical operations on data.
+ - There is no SQL or Python code to analyze for filtering, joining, or grouping.
+ - No valid field-level logic can be inferred.
+ """
+
+
+ def airflow_lineage_event_composer():
+     return """
+ You are an OpenLineage lineage generation expert for Apache Airflow DAGs.
+
+ Your job is to take parsed DAG tasks, field mappings, and logical operations, and generate a **single OpenLineage event JSON** representing full lineage across the DAG.
+
+ ---
+
+ ### You will receive:
+
+ 1. **DAG Task Breakdown** (with dependencies, task_ids, operator type, params)
+
+ 2. **Field Mappings** per task:
+ [
+   {
+     "task_id": "<task_id>",
+     "inputs": [...],
+     "outputs": [...],
+     "transformations": [...]
+   }
+ ]
+
+ 3. **Logical Operators** per task:
+ [
+   {
+     "task_id": "<task_id>",
+     "source_fields": [...],
+     "logical_operators": {
+       "filters": [...],
+       "joins": [...],
+       "group_by": [...],
+       "having": [...],
+       "order_by": [...],
+       "other": [...]
+     }
+   }
+ ]
+
+ ---
+
+ ### Your Task:
+
+ Generate **one OpenLineage event JSON** that captures the full end-to-end data flow and transformations in the DAG.
+
+ Strictly follow the format below:
+
+ - Do NOT rename, flatten, or restructure any fields or keys.
+ - Output only the final OpenLineage JSON — no extra text, comments, or explanation.
+ - `inputs` should represent input **datasets**, not individual fields.
+ - Based on the following examples, generate <INPUT_NAMESPACE>, <INPUT_NAME>, <OUTPUT_NAMESPACE>, <OUTPUT_NAME> for Apache Airflow DAGs and tasks (file-based sources/targets, SQL-based operators, cloud storage operators, in-memory variables):
+
+ Airflow PythonOperator (reads local file)
+ def _read_file():
+     with open("/data/raw/customers.csv") as f:
+         return f.read()
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: file./data/raw/customers.csv
+
+ Airflow PythonOperator (writes local file)
+ def _write_file(data):
+     with open("/data/curated/customers_curated.csv", "w") as f:
+         f.write(data)
+ Expected:
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: file./data/curated/customers_curated.csv
+
+ Airflow BashOperator (reads S3 file)
+ bash_command="aws s3 cp s3://datalake/raw/events/2025-08-01.json -"
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: s3./datalake/raw/events/2025-08-01.json
+
+ Airflow BashOperator (writes S3 file)
+ bash_command="aws s3 cp /tmp/output.json s3://warehouse/gold/output.json"
+ Expected:
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: s3./warehouse/gold/output.json
+
+ Airflow SQL operators (PostgresOperator with schema.table)
+ sql="SELECT * FROM analytics.orders"
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: analytics.orders
+
+ Airflow SQL operators (BigQueryOperator with project.dataset.table)
+ sql="SELECT id FROM project123.dataset456.customers"
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: project123
+ <INPUT_NAME> or <OUTPUT_NAME>: dataset456.customers
+
+ Airflow S3ToRedshiftOperator
+ s3_bucket="datalake", s3_key="bronze/sales.csv", table="analytics.sales"
+ Expected:
+ <INPUT_NAMESPACE>: default
+ <INPUT_NAME>: s3./datalake/bronze/sales.csv
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: analytics.sales
+
+ Airflow LocalFilesystemToGCSOperator
+ src="/tmp/data.json", dst="bronze/data.json"
+ Expected:
+ <INPUT_NAMESPACE>: default
+ <INPUT_NAME>: file./tmp/data.json
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: gs./bronze/data.json
+
+ Airflow in-memory XCom variable
+ ti.xcom_push(key="intermediate_data", value=[1,2,3])
+ Expected:
+ <OUTPUT_NAMESPACE>: temp
+ <OUTPUT_NAME>: intermediate_data
+
+ Airflow XCom read
+ data = ti.xcom_pull(key="intermediate_data")
+ Expected:
+ <INPUT_NAMESPACE>: temp
+ <INPUT_NAME>: intermediate_data
+
+ Notes:
+ - Use scheme prefixes for path-like sources/targets:
+     file./absolute/or/relative/path
+     s3./bucket/key
+     gs./bucket/key
+     abfs./container/path
+ - For in-memory XComs or Python variables, use:
+     <NAMESPACE> = temp
+     <NAME> = <variable_or_key_name>
+ - For SQL-based operators:
+     BigQuery: namespace = <project>, name = <dataset.table>
+     Postgres/MySQL: namespace = default, name = <schema.table>
+     SQL Server: namespace = <database>, name = <schema.table>
+ - Wherever you can't find information for <STORAGE_LAYER>, <FILE_FORMAT>, <DATASET_TYPE>, <SUB_TYPE>, <LIFECYCLE>, <OWNER_NAME>, <OWNER_TYPE>, <SUBTYPE>, <DESCRIPTION>, write "NA".
+ - Very important: Your output must follow **exactly** the JSON structure specified below — do not output explanations, comments, or anything else.
+
+ ---
+
+ ### Required Output Format (Example):
+ {
+   "inputs": [
+     {
+       "namespace": "<INPUT_NAMESPACE>",
+       "name": "<INPUT_NAME>",
+       "facets": {
+         "schema": {
+           "fields": [
+             {
+               "name": "<FIELD_NAME>",
+               "type": "<FIELD_TYPE>",
+               "description": "<FIELD_DESCRIPTION>"
+             }
+           ]
+         }
+       }
+     }
+   ],
+   "outputs": [
+     {
+       "namespace": "<OUTPUT_NAMESPACE>",
+       "name": "<OUTPUT_NAME>",
+       "facets": {
+         "columnLineage": {
+           "fields": {
+             "<OUTPUT_FIELD_NAME>": {
+               "inputFields": [
+                 {
+                   "namespace": "<INPUT_NAMESPACE>",
+                   "name": "<INPUT_NAME>",
+                   "field": "<INPUT_FIELD_NAME>",
+                   "transformations": [
+                     {
+                       "type": "<TRANSFORMATION_TYPE>",
+                       "subtype": "<SUBTYPE>",
+                       "description": "<DESCRIPTION>",
+                       "masking": false
+                     }
+                   ]
+                 }
+               ]
+             }
+           }
+         }
+       }
+     }
+   ]
+ }
+
+ Return only results in the JSON schema format above. Do not add any text.
+ """
lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_params.py ADDED
@@ -0,0 +1,9 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv(override=True)
+
+ # airflow_lineage_agent MCP server params
+ airflow_mcp_server_params = [
+     {"command": "python", "args": ["lf_algorithm/plugins/airflow_lineage_agent/mcp_servers/mcp_airflow_lineage/lineage_airflow_server.py"]},
+ ]
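The server path above is resolved against the process working directory, so launching from anywhere but the repo root would break it. A sketch of an absolute-path variant (an alternative, not what this commit does):

```python
import os

from dotenv import load_dotenv

load_dotenv(override=True)

# Resolve the server script relative to this file instead of the CWD.
_HERE = os.path.dirname(os.path.abspath(__file__))
_SERVER = os.path.join(_HERE, "mcp_airflow_lineage", "lineage_airflow_server.py")

airflow_mcp_server_params = [
    {"command": "python", "args": [_SERVER]},
]
```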
lf_algorithm/plugins/java_lineage_agent/__init__.py ADDED
@@ -0,0 +1 @@
+
lf_algorithm/plugins/java_lineage_agent/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (206 Bytes).
lf_algorithm/plugins/java_lineage_agent/__pycache__/java_instructions.cpython-313.pyc ADDED
Binary file (5.15 kB).
lf_algorithm/plugins/java_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc ADDED
Binary file (5.84 kB).
lf_algorithm/plugins/java_lineage_agent/java_instructions.py ADDED
@@ -0,0 +1,98 @@
+ def comprehensive_analysis_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Perform complete Java code lineage analysis in a single comprehensive process.
+
+ **Complete Analysis Process:**
+
+ **Step 1: Syntax Analysis**
+ 1. Call the java_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Java code structure
+ 3. Store the syntax analysis results for use in subsequent steps
+
+ **Step 2: Field Derivation**
+ 1. Call the java_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Use the syntax analysis results from Step 1 to inform your field mapping analysis
+ 3. Follow the MCP tool instructions exactly to analyze field mappings and transformations
+ 4. Store the field derivation results
+
+ **Step 3: Operation Tracing**
+ 1. Call the java_lineage_operation_tracing() MCP tool to get expert instructions
+ 2. Use the syntax analysis results from Step 1 to inform your operation analysis
+ 3. Follow the MCP tool instructions exactly to analyze logical operations and operators
+ 4. Store the operation tracing results
+
+ **Step 4: Event Composition**
+ 1. Call the java_lineage_event_composer() MCP tool to get expert instructions
+ 2. Combine all previous analysis results (syntax, field derivation, operation tracing)
+ 3. Follow the MCP tool instructions exactly to compose the final OpenLineage event
+ 4. Return the complete OpenLineage event
+
+ **Important Guidelines:**
+ - Each MCP tool contains detailed instructions, examples, and output format requirements
+ - Follow the MCP tool instructions precisely for each step
+ - Maintain context between steps - use results from earlier steps to inform later analysis
+ - Ensure the final output is a complete, properly formatted OpenLineage event
+ - If any step fails, provide clear error information and stop the process
+
+ **Workflow Summary:**
+ Syntax Analysis → Field Derivation → Operation Tracing → Event Composition → Final Output
+ """
+
+ # Keep the individual instructions for backward compatibility if needed
+ def syntax_analysis_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Analyze the provided Java code for syntax structure.
+
+ **Process:**
+ 1. Call the java_lineage_syntax_analysis() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze the Java code
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def field_derivation_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Analyze field mappings and transformations in the Java code.
+
+ **Process:**
+ 1. Call the java_lineage_field_derivation() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze field mappings
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def operation_tracing_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Analyze logical operations and operators in the Java code.
+
+ **Process:**
+ 1. Call the java_lineage_operation_tracing() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to analyze logical operations
+ 3. Return the analysis results in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
+
+ def event_composer_instructions(name: str):
+     return f"""
+ You are the {name} Java lineage analysis agent.
+
+ **Your Task:** Compose OpenLineage events from the provided analysis data.
+
+ **Process:**
+ 1. Call the java_lineage_event_composer() MCP tool to get expert instructions
+ 2. Follow those instructions exactly to compose the OpenLineage event
+ 3. Return the event in the format specified by the MCP tool
+
+ **Important:** The MCP tool contains all the detailed instructions, examples, and output format requirements. Follow them precisely.
+ """
lf_algorithm/plugins/java_lineage_agent/lineage_agent.py ADDED
@@ -0,0 +1,97 @@
+ import os
+ import sys
+ import logging
+ from contextlib import AsyncExitStack
+ from agents import Agent, Tool, Runner, trace
+ from agents.mcp.server import MCPServerStdio
+ from typing import Dict, Any, Optional
+
+ from ...utils.tracers import log_trace_id
+ from ...plugins.java_lineage_agent.java_instructions import comprehensive_analysis_instructions
+ from ...plugins.java_lineage_agent.mcp_servers.mcp_params import java_mcp_server_params
+ from ...utils.file_utils import dump_json_record
+
+ # Get logger for this module
+ logger = logging.getLogger(__name__)
+
+ MAX_TURNS = 30  # Increased for comprehensive analysis
+
+
+ class JavaLineageAgent:
+     """Plugin agent for Java lineage analysis"""
+
+     def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None):
+         self.agent_name = agent_name
+         self.model_name = model_name
+         self.source_code = source_code
+         self.get_model_func = get_model_func
+
+     async def create_agent(self, java_mcp_servers) -> Agent:
+         # Use the passed get_model_func or fall back to the centralized one
+         if self.get_model_func:
+             model = self.get_model_func(self.model_name)
+         else:
+             from ...utils import get_model
+             model = get_model(self.model_name)
+
+         agent = Agent(
+             name=self.agent_name,
+             instructions=comprehensive_analysis_instructions(self.agent_name),
+             model=model,
+             mcp_servers=java_mcp_servers,
+         )
+         return agent
+
+     async def run_agent(self, java_mcp_servers, source_code: str):
+         # Create a single agent for comprehensive analysis
+         comprehensive_agent = await self.create_agent(java_mcp_servers)
+
+         # Run the complete analysis in one go
+         result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS)
+
+         # Return the final output
+         return dump_json_record(self.agent_name, result.final_output)
+
+     async def run_with_mcp_servers(self, source_code: str):
+         async with AsyncExitStack() as stack:
+             java_mcp_servers = [
+                 await stack.enter_async_context(
+                     MCPServerStdio(params, client_session_timeout_seconds=120)
+                 )
+                 for params in java_mcp_server_params
+             ]
+             return await self.run_agent(java_mcp_servers, source_code=source_code)
+
+     async def run_with_trace(self, source_code: str):
+         trace_name = f"{self.agent_name}-lineage-agent"
+         trace_id = log_trace_id(f"{self.agent_name.lower()}")
+         with trace(trace_name, trace_id=trace_id):
+             return await self.run_with_mcp_servers(source_code=source_code)
+
+     async def run(self):
+         try:
+             logger.info(f"Starting Java lineage analysis for {self.agent_name}")
+             result = await self.run_with_trace(self.source_code)
+             logger.info(f"Completed Java lineage analysis for {self.agent_name}")
+             return result
+         except Exception as e:
+             logger.error(f"Error running {self.agent_name}: {e}")
+             return {"error": str(e)}
+
+
+ # Plugin interface functions
+ def create_java_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> JavaLineageAgent:
+     """Factory function to create a JavaLineageAgent instance"""
+     return JavaLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func)
+
+
+ def get_plugin_info() -> Dict[str, Any]:
+     """Return plugin metadata"""
+     return {
+         "name": "java-lineage-agent",
+         "description": "Java lineage analysis agent for parsing and analyzing Java code",
+         "version": "1.0.0",
+         "author": "Ali Shamsaddinlou",
+         "agent_class": JavaLineageAgent,
+         "factory_function": create_java_lineage_agent,
+     }
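A minimal end-to-end usage sketch for this plugin, assuming model credentials (e.g., OPENAI_API_KEY) are configured in the environment; the Java snippet passed in is illustrative:

import asyncio
from lf_algorithm.plugins.java_lineage_agent.lineage_agent import create_java_lineage_agent

java_code = 'List<String> lines = Files.readAllLines(Paths.get("sales.csv"));'

# Build the agent via the factory function, then run the full
# syntax -> field derivation -> operation tracing -> event composition pipeline.
agent = create_java_lineage_agent(agent_name="java-demo", source_code=java_code)
result = asyncio.run(agent.run())
print(result)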
lf_algorithm/plugins/java_lineage_agent/mcp_servers/__init__.py ADDED
File without changes
lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (218 Bytes).
 
lf_algorithm/plugins/java_lineage_agent/mcp_servers/__pycache__/mcp_params.cpython-313.pyc ADDED
Binary file (500 Bytes).
 
lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/__init__.py ADDED
File without changes
lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py ADDED
@@ -0,0 +1,55 @@
+ import logging
+
+ # Configure logging to suppress verbose output
+ logging.basicConfig(level=logging.WARNING)
+ logging.getLogger('mcp').setLevel(logging.WARNING)
+ logging.getLogger('mcp.server').setLevel(logging.WARNING)
+
+ from mcp.server.fastmcp import FastMCP
+ from typing import Dict, Any
+
+ mcp = FastMCP("lineage_java_server")
+
+ from templates import (java_lineage_syntax_analysis as syntax_analysis_template,
+                        java_lineage_field_derivation as field_derivation_template,
+                        java_lineage_operation_tracing as operation_tracing_template,
+                        java_lineage_event_composer as event_composer_template)
+
+ @mcp.tool()
+ async def java_lineage_syntax_analysis() -> Dict[str, Any]:
+     """Java lineage structure and syntax decomposition expert"""
+     return {
+         "instructions": syntax_analysis_template(),
+         "version": "1.0.0",
+         "capabilities": ["java_parsing", "method_extraction", "block_analysis"]
+     }
+
+ @mcp.tool()
+ async def java_lineage_field_derivation() -> Dict[str, Any]:
+     """Field mapping and field derivation expert"""
+     return {
+         "instructions": field_derivation_template(),
+         "version": "1.0.0",
+         "capabilities": ["field_mapping", "transformation_analysis", "column_lineage"]
+     }
+
+ @mcp.tool()
+ async def java_lineage_operation_tracing() -> Dict[str, Any]:
+     """Logical operator analysis and operation tracing expert"""
+     return {
+         "instructions": operation_tracing_template(),
+         "version": "1.0.0",
+         "capabilities": ["filter_analysis", "stream_analysis", "aggregation_tracking"]
+     }
+
+ @mcp.tool()
+ async def java_lineage_event_composer() -> Dict[str, Any]:
+     """Event composition and aggregation expert"""
+     return {
+         "instructions": event_composer_template(),
+         "version": "1.0.0",
+         "capabilities": ["openlineage_generation", "event_composition", "metadata_aggregation"]
+     }
+
+ if __name__ == "__main__":
+     mcp.run(transport='stdio')
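Because this server speaks MCP over stdio, its tools can also be exercised directly with the reference MCP client, independent of any agent. A sketch, assuming the mcp Python SDK is installed and the script is run from the repository root:

import asyncio
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

async def fetch_instructions():
    params = StdioServerParameters(
        command="python",
        args=["lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py"],
    )
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Each tool takes no arguments and returns instructions plus metadata
            return await session.call_tool("java_lineage_syntax_analysis", {})

print(asyncio.run(fetch_instructions()))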
lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/templates.py ADDED
@@ -0,0 +1,605 @@
+ from datetime import datetime
+
+
+ def java_lineage_syntax_analysis():
+     return """
+ You are a Java data pipeline decomposition expert. Your task is to analyze complex Java source files and extract discrete, logical transformation blocks. These include data source initialization, filtering, transformation, aggregation, feature derivation, and any computation logic. Each extracted block should be meaningful, self-contained, and independently interpretable.
+
+ Instructions:
+ - Extract: Complete transformation steps, including data source initialization, filtering, mapping, joining, grouping, calculating, or any pre/postprocessing blocks.
+ - Do NOT extract single lines unless they represent a standalone logical operation or setup (e.g., reading a file, defining a method, or a full map/filter chain).
+ - Group tightly related chained operations (e.g., Java Stream chains) into a single transformation unit.
+ - Preserve entire method definitions or reusable transformation blocks intact.
+ - Comment lines (// ...) can help guide naming but should not be extracted on their own.
+
+ Output Format (JSON):
+ {
+   "sp1": { "name": "<descriptive_name>", "code": "<valid_java_code_block>" },
+   "sp2": { "name": "<descriptive_name>", "code": "<valid_java_code_block>" },
+   ...
+ }
+
+ ---
+
+ Positive Example 1:
+
+ Input Java:
+ import java.nio.file.*;
+ import java.util.*;
+ import java.util.stream.*;
+
+ public class DataProcessor {
+     public static void main(String[] args) throws Exception {
+         // Load data
+         List<String> lines = Files.readAllLines(Paths.get("sales.csv"));
+
+         // Parse and clean data
+         List<Sale> sales = lines.stream()
+             .skip(1)
+             .map(Sale::fromCsv)
+             .filter(s -> s.getPrice() != null)
+             .collect(Collectors.toList());
+
+         // Compute revenue
+         for (Sale s : sales) {
+             s.setRevenue(s.getPrice() * s.getQuantity());
+         }
+
+         // Filter high revenue
+         List<Sale> highRevenue = sales.stream()
+             .filter(s -> s.getRevenue() > 1000)
+             .collect(Collectors.toList());
+     }
+ }
+
+ Expected Output:
+ {
+   "sp1": {
+     "name": "load_sales_data_from_csv",
+     "code": "List<String> lines = Files.readAllLines(Paths.get(\"sales.csv\"));"
+   },
+   "sp2": {
+     "name": "parse_and_clean_sales_data",
+     "code": "List<Sale> sales = lines.stream()\n .skip(1)\n .map(Sale::fromCsv)\n .filter(s -> s.getPrice() != null)\n .collect(Collectors.toList());"
+   },
+   "sp3": {
+     "name": "compute_revenue_per_sale",
+     "code": "for (Sale s : sales) {\n s.setRevenue(s.getPrice() * s.getQuantity());\n}"
+   },
+   "sp4": {
+     "name": "filter_high_revenue_sales",
+     "code": "List<Sale> highRevenue = sales.stream()\n .filter(s -> s.getRevenue() > 1000)\n .collect(Collectors.toList());"
+   }
+ }
+
+ ---
+
+ Positive Example 2 (with method definition):
+
+ Input Java:
+ public static List<Double> normalize(List<Double> values) {
+     double mean = values.stream().mapToDouble(v -> v).average().orElse(0.0);
+     double std = Math.sqrt(values.stream().mapToDouble(v -> Math.pow(v - mean, 2)).average().orElse(0.0));
+     return values.stream().map(v -> (v - mean) / std).collect(Collectors.toList());
+ }
+
+ // In main
+ List<Double> incomes = loadIncomeData(); // Assume loaded
+ List<Double> normalized = normalize(incomes);
+
+ Expected Output:
+ {
+   "sp1": {
+     "name": "define_normalize_method",
+     "code": "public static List<Double> normalize(List<Double> values) {\n double mean = values.stream().mapToDouble(v -> v).average().orElse(0.0);\n double std = Math.sqrt(values.stream().mapToDouble(v -> Math.pow(v - mean, 2)).average().orElse(0.0));\n return values.stream().map(v -> (v - mean) / std).collect(Collectors.toList());\n}"
+   },
+   "sp2": {
+     "name": "load_income_data",
+     "code": "List<Double> incomes = loadIncomeData();"
+   },
+   "sp3": {
+     "name": "normalize_income_values",
+     "code": "List<Double> normalized = normalize(incomes);"
+   }
+ }
+
+ ---
+
+ Negative Example (Too granular):
+
+ {
+   "sp1": { "name": "skip_header", "code": "lines.stream().skip(1)" },
+   "sp2": { "name": "filter_null_price", "code": ".filter(s -> s.getPrice() != null)" }
+ }
+
+ Reason: These operations are tightly chained and should be grouped into a cohesive transformation step.
+ """
+
+
+ def java_lineage_field_derivation():
+     return """
+ You are a Java field mapping analysis expert. Given a Java code snippet (typically part of a data transformation pipeline), your job is to extract and explain how each output field or variable is derived. For each, identify:
+
+ 1. The **source field(s)** or variables it depends on
+ 2. The **transformation logic** applied (e.g., arithmetic operation, aggregation, string manipulation, method call, etc.)
+
+ Output Format:
+ {
+   "output_fields": [
+     {
+       "namespace": "<INPUT_NAMESPACE>",
+       "name": "<INPUT_NAME>",
+       "field": "<INPUT_FIELD_NAME>",
+       "transformation": "<description of logic>"
+     },
+     ...
+   ]
+ }
+
+ ---
+
+ Positive Example 1:
+
+ Input Java:
+ read from table employee
+ Employee employee = new Employee();
+ employee.setAnnualSalary(employee.getMonthlySalary() * 12);
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "namespace": "default",
+       "name": "employee",
+       "field": "monthlySalary",
+       "transformation": "Multiplied by 12"
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 2:
+
+ Input Java:
+ user.setFullName(user.getFirstName().toUpperCase() + " " + user.getLastName());
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "namespace": "default",
+       "name": "user",
+       "field": "firstName",
+       "transformation": "Concatenation with space; UPPER applied to first name"
+     },
+     {
+       "namespace": "default",
+       "name": "user",
+       "field": "lastName",
+       "transformation": "Concatenation with space; UPPER applied to last name"
+     }
+   ]
+ }
+
+ ---
+
+ Negative Example 1 (Incorrect: Unstructured):
+
+ {
+   "annualSalary": "employee.getMonthlySalary() * 12"
+ }
+
+ Reason: This is a raw expression and doesn't explain the transformation clearly or follow the expected schema.
+
+ ---
+
+ Negative Example 2 (Incorrect: Missing logic):
+
+ Input Java:
+ invoice.setTax(invoice.getIncome() * 0.3);
+
+ Incorrect Output:
+ {
+   "output_fields": [
+     {
+       "name": "tax",
+       "source": "invoice.getIncome()",
+       "transformation": "Direct"
+     }
+   ]
+ }
+
+ Reason: Transformation logic must describe that it was "Multiplied by 0.3", not just "Direct".
+ """
+
+
+ def java_lineage_operation_tracing():
+     return """
+ You are a Java logical operator analysis expert. Your task is to analyze Java code (typically using Streams, custom filter logic, or data transformation libraries) and extract all **logical operations** applied to data structures such as lists, maps, or custom data models, including:
+
+ - WHERE-like filters (e.g., `.filter()`, `if` conditions inside loops)
+ - JOIN conditions (e.g., matching fields from two objects)
+ - GROUP BY and aggregation keys (e.g., `.collect(groupingBy(...))`)
+ - Filtering after grouping (e.g., filtering a grouped map)
+ - Sorting operations (e.g., `.sorted(Comparator.comparing(...))`)
+ - Any logical expressions affecting element selection (e.g., `.anyMatch()`, `Predicate`, custom boolean-returning lambdas)
+
+ Only list the fields involved in logical operations, not all fields.
+
+ Return the result in the following structured format:
+
+ {
+   "output_fields": [
+     {
+       "source_structure": "<list_or_collection_variable_name>",
+       "source_fields": ["<field_1>", "<field_2>", "..."],
+       "logical_operators": {
+         "filters": [],
+         "joins": [],
+         "group_by": [],
+         "having": [],
+         "order_by": [],
+         "other": []
+       }
+     }
+   ]
+ }
+
+ - Only include entries for logical operators if the list is non-empty.
+ - Represent conditions and expressions fully and clearly.
+ - Normalize filters and joins (e.g., `e.getAge() > 18`, `emp.getDeptId() == dept.getId()`)
+ - Include all source collections involved and only the fields used in logical operations.
+
+ ---
+
+ Positive Example 1:
+
+ Input Java:
+ List<Employee> filtered = employees.stream()
+     .filter(e -> e.getRegion().equals("US"))
+     .collect(Collectors.toList());
+
+ Map<String, Double> grouped = filtered.stream()
+     .collect(Collectors.groupingBy(Employee::getCustomerId, Collectors.summingDouble(Employee::getAmount)));
+
+ Map<String, Double> result = grouped.entrySet().stream()
+     .filter(entry -> entry.getValue() > 1000)
+     .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "source_structure": "employees",
+       "source_fields": ["region", "customerId", "amount"],
+       "logical_operators": {
+         "filters": ["e.getRegion().equals(\"US\")", "entry.getValue() > 1000"],
+         "group_by": ["Employee::getCustomerId"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 2:
+
+ Input Java:
+ List<Merged> merged = employees.stream()
+     .flatMap(emp -> departments.stream()
+         .filter(dept -> emp.getDeptId() == dept.getId())
+         .map(dept -> new Merged(emp, dept)))
+     .collect(Collectors.toList());
+
+ List<Merged> active = merged.stream()
+     .filter(m -> m.getStatus().equals("active"))
+     .sorted(Comparator.comparing(Merged::getName))
+     .collect(Collectors.toList());
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "source_structure": "employees",
+       "source_fields": ["deptId", "status", "name"],
+       "logical_operators": {
+         "joins": ["emp.getDeptId() == dept.getId()"],
+         "filters": ["m.getStatus().equals(\"active\")"],
+         "order_by": ["Merged::getName"]
+       }
+     },
+     {
+       "source_structure": "departments",
+       "source_fields": ["id"],
+       "logical_operators": {
+         "joins": ["emp.getDeptId() == dept.getId()"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Positive Example 3:
+
+ Input Java:
+ List<Account> flagged = accounts.stream()
+     .peek(a -> a.setFlag(a.getStatus().equals("closed") ? 1 : 0))
+     .collect(Collectors.toList());
+
+ Expected Output:
+ {
+   "output_fields": [
+     {
+       "source_structure": "accounts",
+       "source_fields": ["status"],
+       "logical_operators": {
+         "other": ["a.getStatus().equals(\"closed\") ? 1 : 0"]
+       }
+     }
+   ]
+ }
+
+ ---
+
+ Negative Example 1 (Incorrect formatting):
+
+ {
+   "filters": "e.getRegion().equals(\"US\")",
+   "group_by": "Employee::getCustomerId"
+ }
+
+ Reason: This structure is flat and omits `source_structure`, `source_fields`, and required nesting under `output_fields`.
+
+ ---
+
+ Negative Example 2 (Missing logical clause):
+
+ Input Java:
+ List<User> result = users.stream()
+     .filter(u -> u.getAge() > 18)
+     .sorted(Comparator.comparing(User::getSignupDate))
+     .collect(Collectors.toList());
+
+ Incorrect Output:
+ {
+   "output_fields": [
+     {
+       "source_structure": "users",
+       "source_fields": ["age"],
+       "logical_operators": {
+         "filters": ["u.getAge() > 18"]
+       }
+     }
+   ]
+ }
+
+ Reason: The `order_by` clause is missing. `signupDate` must be included in `source_fields` and in `order_by`.
+ """
+
+
+ def java_lineage_event_composer():
+     return """
+ You are an OpenLineage lineage generation expert.
+
+ Your job is to take the outputs from upstream Java data analysis agents and generate a **single, complete OpenLineage event JSON** representing end-to-end data lineage for the transformation pipeline.
+
+ ---
+
+ ### You will receive:
+
+ 1. **Parsed Code Blocks** representing key transformation steps:
+ {
+   "sp1": { "name": "load_data", "code": "<Java code block>" },
+   "sp2": { "name": "filter_data", "code": "<Java code block>" },
+   "sp3": { "name": "compute_result", "code": "<Java code block>" }
+ }
+
+ 2. **Field Mappings**: one per code block (same order), in this format:
+ [
+   {
+     "output_fields": [
+       {
+         "name": "<output_variable_or_field>",
+         "source": "<input_field(s) or variable(s)>",
+         "transformation": "<description of logic>"
+       }
+     ]
+   },
+   ...
+ ]
+
+ 3. **Logical Operators**: one per code block (same order), in this format:
+ [
+   {
+     "output_fields": [
+       {
+         "source_structure": "<collection_name_or_stream_variable>",
+         "source_fields": ["field1", "field2"],
+         "logical_operators": {
+           "filters": ["..."],
+           "joins": ["..."],
+           "group_by": ["..."],
+           "having": ["..."],
+           "order_by": ["..."],
+           "other": ["..."]
+         }
+       }
+     ]
+   },
+   ...
+ ]
+
+ ---
+
+ ### Your Task:
+
+ Generate **one event JSON** that captures the **entire pipeline** from raw source data to final derived outputs.
+
+ Strictly follow the structure below and do not change field names or nesting. It is **very important** to keep the exact same format:
+
+ - Use `"inputs"` and `"outputs"` as array keys (do NOT use `inputDataset` or `outputDataset`)
+ - Preserve `"facets"` blocks under `"job"`, `"inputs"`, and `"outputs"`
+ - Include `"columnLineage"` as a facet under `"outputs.facets"` (not at the top level)
+ - Maintain the exact field names:
+   - `"eventType"`, `"eventTime"`, `"run"`, `"job"`, `"inputs"`, `"outputs"`, `"facets"`, `"query"`, `"processingType"`, `"integration"`, etc.
+
+ 3. You should include all the fields mentioned in the following JSON schema.
+ 4. Based on the following examples, generate <INPUT_NAMESPACE>, <INPUT_NAME>, <OUTPUT_NAMESPACE>, <OUTPUT_NAME> for Java code patterns (pure Java I/O, JDBC, Hibernate/JPA):
+
+ Pure Java (read file via NIO)
+ List<String> lines = java.nio.file.Files.readAllLines(java.nio.file.Paths.get("/data/raw/customers.csv"));
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: file./data/raw/customers.csv
+
+ Pure Java (write file)
+ java.nio.file.Files.write(java.nio.file.Paths.get("/data/curated/sales_curated.csv"), bytes);
+ Expected:
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: file./data/curated/sales_curated.csv
+
+ In-memory collections/objects
+ List<Customer> customers = new ArrayList<>();
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: temp
+ <INPUT_NAME> or <OUTPUT_NAME>: customers
+
+ JDBC (PostgreSQL) with explicit schema.table
+ String sql = "SELECT * FROM analytics.orders";
+ try (Connection c = DriverManager.getConnection("jdbc:postgresql://host:5432/db");
+      Statement s = c.createStatement();
+      ResultSet rs = s.executeQuery(sql))
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: analytics.orders
+
+ JDBC (MySQL) database.table
+ String sql = "SELECT u.id, u.email FROM ecommerce.users u";
+ try (Connection c = DriverManager.getConnection("jdbc:mysql://host:3306/shop");
+      Statement s = c.createStatement();
+      ResultSet rs = s.executeQuery(sql))
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: ecommerce.users
+
+ JDBC (SQL Server) database.schema.table
+ String sql = "SELECT * FROM sales.dbo.orders";
+ try (Connection c = DriverManager.getConnection("jdbc:sqlserver://host;databaseName=sales");
+      Statement s = c.createStatement();
+      ResultSet rs = s.executeQuery(sql))
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: sales
+ <INPUT_NAME> or <OUTPUT_NAME>: dbo.orders
+
+ JDBC (Oracle) schema.table
+ String sql = "SELECT * FROM HR.EMPLOYEES";
+ try (Connection c = DriverManager.getConnection("jdbc:oracle:thin:@//host:1521/ORCLPDB1");
+      Statement s = c.createStatement();
+      ResultSet rs = s.executeQuery(sql))
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: HR.EMPLOYEES
+
+ Hibernate / JPA (Entity with schema)
+ @Entity
+ @Table(name = "orders", schema = "sales")
+ class Order { ... }
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: sales.orders
+
+ Hibernate / JPA (Entity without schema; default schema)
+ @Entity
+ @Table(name = "customers")
+ class Customer { ... }
+ Expected:
+ <INPUT_NAMESPACE> or <OUTPUT_NAMESPACE>: default
+ <INPUT_NAME> or <OUTPUT_NAME>: customers
+
+ JDBC write (INSERT into schema.table)
+ String sql = "INSERT INTO analytics.daily_metrics (run_date, total) VALUES (?, ?)";
+ Expected:
+ <OUTPUT_NAMESPACE>: default
+ <OUTPUT_NAME>: analytics.daily_metrics
+
+ Notes:
+ - Use scheme prefixes for path-like sources/targets when present:
+   file./absolute/or/relative/path
+   s3./bucket/key
+   gs./bucket/key
+   abfs./container/path
+ - For in-memory variables/collections, use:
+   <NAMESPACE> = temp
+   <NAME> = <variable_or_field_name>
+ - For relational sources/targets referenced via SQL, prefer <NAME> = <schema.table>. If a database/catalog prefix exists (e.g., SQL Server), map it to <NAMESPACE> and keep <NAME> = <schema.table>. Otherwise use <NAMESPACE> = default.
+ - Wherever you can't find information for <STORAGE_LAYER>, <FILE_FORMAT>, <DATASET_TYPE>, <SUB_TYPE>, <LIFECYCLE>, <OWNER_NAME>, <OWNER_TYPE>, <SUBTYPE>, <DESCRIPTION>, write "NA".
+ - Very important: your output must follow **exactly** the JSON structure below; do not output explanations, comments, or anything else.
+
+ ---
+
+ ### Required Output Format (Example):
+
+ {
+   "inputs": [
+     {
+       "namespace": "<INPUT_NAMESPACE>",
+       "name": "<INPUT_NAME>",
+       "facets": {
+         "schema": {
+           "fields": [
+             {
+               "name": "<FIELD_NAME>",
+               "type": "<FIELD_TYPE>",
+               "description": "<FIELD_DESCRIPTION>"
+             }
+           ]
+         }
+       }
+     }
+   ],
+   "outputs": [
+     {
+       "namespace": "<OUTPUT_NAMESPACE>",
+       "name": "<OUTPUT_NAME>",
+       "facets": {
+         "columnLineage": {
+           "fields": {
+             "<OUTPUT_FIELD_NAME>": {
+               "inputFields": [
+                 {
+                   "namespace": "<INPUT_NAMESPACE>",
+                   "name": "<INPUT_NAME>",
+                   "field": "<INPUT_FIELD_NAME>",
+                   "transformations": [
+                     {
+                       "type": "<TRANSFORMATION_TYPE>",
+                       "subtype": "<SUBTYPE>",
+                       "description": "<DESCRIPTION>",
+                       "masking": false
+                     }
+                   ]
+                 }
+               ]
+             }
+           }
+         }
+       }
+     }
+   ]
+ }
+
+ 5. Return only results in the above-mentioned JSON schema format. Do not add any text.
+ """
lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_params.py ADDED
@@ -0,0 +1,9 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv(override=True)
+
+ # java_lineage_agent mcp server params
+ java_mcp_server_params = [
+     {"command": "python", "args": ["lf_algorithm/plugins/java_lineage_agent/mcp_servers/mcp_java_lineage/lineage_java_server.py"]},
+ ]
lf_algorithm/plugins/python_lineage_agent/__init__.py ADDED
@@ -0,0 +1 @@
+
lf_algorithm/plugins/python_lineage_agent/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (208 Bytes).
 
lf_algorithm/plugins/python_lineage_agent/__pycache__/lineage_agent.cpython-313.pyc ADDED
Binary file (5.88 kB).
 
lf_algorithm/plugins/python_lineage_agent/__pycache__/python_instructions.cpython-313.pyc ADDED
Binary file (5.21 kB).
 
lf_algorithm/plugins/python_lineage_agent/lineage_agent.py ADDED
@@ -0,0 +1,97 @@
+ import os
+ import sys
+ import logging
+ from contextlib import AsyncExitStack
+ from agents import Agent, Tool, Runner, trace
+ from agents.mcp.server import MCPServerStdio
+ from typing import Dict, Any, Optional
+
+ from ...utils.tracers import log_trace_id
+ from ...plugins.python_lineage_agent.python_instructions import comprehensive_analysis_instructions
+ from ...plugins.python_lineage_agent.mcp_servers.mcp_params import python_mcp_server_params
+ from ...utils.file_utils import dump_json_record
+
+ # Get logger for this module
+ logger = logging.getLogger(__name__)
+
+ MAX_TURNS = 30  # Increased for comprehensive analysis
+
+
+ class PythonLineageAgent:
+     """Plugin agent for Python lineage analysis"""
+
+     def __init__(self, agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None):
+         self.agent_name = agent_name
+         self.model_name = model_name
+         self.source_code = source_code
+         self.get_model_func = get_model_func
+
+     async def create_agent(self, python_mcp_servers) -> Agent:
+         # Use the passed get_model_func or fall back to the centralized one
+         if self.get_model_func:
+             model = self.get_model_func(self.model_name)
+         else:
+             from ...utils import get_model
+             model = get_model(self.model_name)
+
+         agent = Agent(
+             name=self.agent_name,
+             instructions=comprehensive_analysis_instructions(self.agent_name),
+             model=model,
+             mcp_servers=python_mcp_servers,
+         )
+         return agent
+
+     async def run_agent(self, python_mcp_servers, source_code: str):
+         # Create a single agent for comprehensive analysis
+         comprehensive_agent = await self.create_agent(python_mcp_servers)
+
+         # Run the complete analysis in one go
+         result = await Runner.run(comprehensive_agent, source_code, max_turns=MAX_TURNS)
+
+         # Return the final output
+         return dump_json_record(self.agent_name, result.final_output)
+
+     async def run_with_mcp_servers(self, source_code: str):
+         async with AsyncExitStack() as stack:
+             python_mcp_servers = [
+                 await stack.enter_async_context(
+                     MCPServerStdio(params, client_session_timeout_seconds=120)
+                 )
+                 for params in python_mcp_server_params
+             ]
+             return await self.run_agent(python_mcp_servers, source_code=source_code)
+
+     async def run_with_trace(self, source_code: str):
+         trace_name = f"{self.agent_name}-lineage-agent"
+         trace_id = log_trace_id(f"{self.agent_name.lower()}")
+         with trace(trace_name, trace_id=trace_id):
+             return await self.run_with_mcp_servers(source_code=source_code)
+
+     async def run(self):
+         try:
+             logger.info(f"Starting Python lineage analysis for {self.agent_name}")
+             result = await self.run_with_trace(self.source_code)
+             logger.info(f"Completed Python lineage analysis for {self.agent_name}")
+             return result
+         except Exception as e:
+             logger.error(f"Error running {self.agent_name}: {e}")
+             return {"error": str(e)}
+
+
+ # Plugin interface functions
+ def create_python_lineage_agent(agent_name: str, source_code: str, model_name: str = "gpt-4o-mini", get_model_func=None) -> PythonLineageAgent:
+     """Factory function to create a PythonLineageAgent instance"""
+     return PythonLineageAgent(agent_name=agent_name, source_code=source_code, model_name=model_name, get_model_func=get_model_func)
+
+
+ def get_plugin_info() -> Dict[str, Any]:
+     """Return plugin metadata"""
+     return {
+         "name": "python-lineage-agent",
+         "description": "Python lineage analysis agent for parsing and analyzing Python code",
+         "version": "1.0.0",
+         "author": "Ali Shamsaddinlou",
+         "agent_class": PythonLineageAgent,
+         "factory_function": create_python_lineage_agent,
+     }
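The get_plugin_info() contract gives callers a uniform way to discover and instantiate plugins without hard-coding classes. A sketch of that generic path, assuming the package is importable and model credentials are configured (the sample Python source string is illustrative):

import asyncio
from lf_algorithm.plugins.python_lineage_agent import lineage_agent as plugin

info = plugin.get_plugin_info()
print(info["name"], info["version"])

# Instantiate through the advertised factory rather than the class directly
agent = info["factory_function"](agent_name="py-demo", source_code="df = pd.read_csv('sales.csv')")
result = asyncio.run(agent.run())
print(result)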
lf_algorithm/plugins/python_lineage_agent/mcp_servers/__init__.py ADDED
File without changes