ZhouChuYue
w3m
06dc834
raw
history blame
12.4 kB
# -*- coding: utf-8 -*-
"""
UltraData Math Parser - Hugging Face Space Demo
A unified HTML parser optimized for extracting mathematical content.
"""
import gradio as gr
from ultradata_math_parser import GeneralParser
def _empty_result(error: str) -> dict:
    """Build a result dictionary with blank content fields and the given error."""
    return {
        "title": "",
        "html": "",
        "text": "",
        "text_length": 0,
        "xp_num": "",
        "fallback_strategy": "",
        "forum_assembled": False,
        "error": error,
    }


def parse_html(
    html_content: str,
    base_url: str = "",
    process_math: bool = True,
    include_tables: bool = True,
    enable_forum_assembly: bool = True,
    html_type: str = "unified",
) -> dict:
    """
    Parse HTML content using GeneralParser.

    This function never raises for parser failures: any exception raised
    while constructing or running the parser is reported through the
    ``error`` key of the returned dictionary.

    Args:
        html_content: Raw HTML string to parse
        base_url: Base URL for resolving relative links
        process_math: Whether to process and convert math expressions
        include_tables: Whether to preserve table elements
        enable_forum_assembly: Whether to enable forum post assembly
        html_type: Parser type (unified/article/forum)

    Returns:
        Dictionary with the keys ``title``, ``html``, ``text``,
        ``text_length``, ``xp_num``, ``fallback_strategy``,
        ``forum_assembled`` and ``error``. ``error`` is ``None`` on success
        and a non-empty message otherwise.
    """
    # Empty, whitespace-only, or missing input is rejected before any parsing.
    if not html_content or not html_content.strip():
        return _empty_result("Please provide HTML content to parse.")

    try:
        # The parser is created inside the try so that a constructor failure
        # is reported like any other parsing error instead of propagating.
        parser = GeneralParser()
        result = parser.extract(
            html=html_content,
            base_url=base_url,
            process_math=process_math,
            include_tables=include_tables,
            enable_forum_assembly=enable_forum_assembly,
            html_type=html_type,
        )
        # Built inside the try on purpose: an unexpected result object
        # (one without ``.get``) is also turned into an error dict.
        return {
            "title": result.get("title", ""),
            "html": result.get("html", ""),
            "text": result.get("text", ""),
            "text_length": result.get("text_length", 0),
            "xp_num": result.get("xp_num", ""),
            "fallback_strategy": result.get("fallback_strategy", ""),
            "forum_assembled": result.get("forum_assembled", False),
            "error": None,
        }
    except Exception as e:  # UI boundary: report the failure, do not crash.
        # Some exceptions stringify to "", which callers would read as
        # "no error"; fall back to the exception class name in that case.
        return _empty_result(str(e) or type(e).__name__)
def format_output(result: dict) -> tuple:
    """Format the parser output for Gradio display.

    Args:
        result: Dictionary as produced by ``parse_html``. Missing keys are
            treated as empty values.

    Returns:
        A 5-tuple ``(metadata_markdown, title, html, text, html_preview)``.
        When ``result["error"]`` is truthy, the first element is the error
        message and the remaining four are empty strings.
    """
    error = result.get("error")
    if error:
        return (f"❌ Error: {error}", "", "", "", "")

    title = result.get("title", "")
    html = result.get("html", "")
    forum_label = "✅ Yes" if result.get("forum_assembled") else "❌ No"

    # Markdown summary shown above the extracted content.
    metadata = (
        "📊 **Parsing Statistics**\n"
        f"- **Title**: {title or 'N/A'}\n"
        f"- **Text Length**: {result.get('text_length', 0)} characters\n"
        f"- **XPath Match**: {result.get('xp_num', '')}\n"
        f"- **Fallback Strategy**: {result.get('fallback_strategy', '')}\n"
        f"- **Forum Assembled**: {forum_label}\n"
    )

    return (
        metadata,
        title,
        html,
        result.get("text", ""),
        html,  # Same markup again, for the rendered HTML preview.
    )
def process_input(html_content, base_url, process_math, include_tables, enable_forum, html_type):
    """Gradio click handler: parse the submitted HTML and shape it for display.

    The arguments arrive in the order of the ``inputs`` list wired to the
    parse button; ``enable_forum`` is forwarded to ``parse_html`` as
    ``enable_forum_assembly``. Returns whatever ``format_output`` produces
    for the parsed result.
    """
    parser_options = dict(
        html_content=html_content,
        base_url=base_url,
        process_math=process_math,
        include_tables=include_tables,
        enable_forum_assembly=enable_forum,
        html_type=html_type,
    )
    return format_output(parse_html(**parser_options))
# Sample page pre-filled into the HTML input box: a short article on the
# quadratic formula containing a MathML block plus header/footer boilerplate.
EXAMPLE_HTML = """<!DOCTYPE html>
<html>
<head>
<title>Quadratic Formula Example</title>
</head>
<body>
<article class="post-content">
<h1>Understanding the Quadratic Formula</h1>
<p>The quadratic formula is used to solve equations of the form ax² + bx + c = 0.</p>
<p>The solution is given by:</p>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mi>x</mi>
<mo>=</mo>
<mfrac>
<mrow>
<mo>-</mo>
<mi>b</mi>
<mo>±</mo>
<msqrt>
<mrow>
<msup><mi>b</mi><mn>2</mn></msup>
<mo>-</mo>
<mn>4</mn>
<mi>a</mi>
<mi>c</mi>
</mrow>
</msqrt>
</mrow>
<mrow>
<mn>2</mn>
<mi>a</mi>
</mrow>
</mfrac>
</math>
<p>Where a, b, and c are coefficients of the quadratic equation.</p>
<h2>Example Problem</h2>
<p>Solve: x² - 5x + 6 = 0</p>
<p>Here, a = 1, b = -5, c = 6</p>
<p>Using the formula: x = (5 ± √(25-24))/2 = (5 ± 1)/2</p>
<p>Therefore, x = 3 or x = 2</p>
</article>
<footer>
<nav>Related articles...</nav>
</footer>
</body>
</html>"""
# Stylesheet passed to gr.Blocks(css=...) when the interface is built.
# It imports the Space Grotesk and JetBrains Mono fonts from Google Fonts,
# applies a dark gradient theme with purple/cyan accents, styles the
# .main-title, .subtitle, .section-header and .output-box classes referenced
# by the layout, and hides <footer> elements.
# NOTE(review): the .gr-* selectors presumably target Gradio's own DOM
# classes; confirm they still match the installed Gradio version.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&family=Space+Grotesk:wght@400;500;600;700&display=swap');
.gradio-container {
font-family: 'Space Grotesk', sans-serif !important;
background: linear-gradient(135deg, #0f0f23 0%, #1a1a3e 50%, #0f0f23 100%) !important;
min-height: 100vh;
}
.main-title {
font-family: 'Space Grotesk', sans-serif !important;
font-weight: 700 !important;
font-size: 2.5rem !important;
background: linear-gradient(90deg, #00d4ff, #7c3aed, #f472b6) !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: transparent !important;
background-clip: text !important;
text-align: center !important;
margin-bottom: 0.5rem !important;
}
.subtitle {
text-align: center !important;
color: #94a3b8 !important;
font-size: 1.1rem !important;
margin-bottom: 2rem !important;
}
.gr-box {
border-radius: 12px !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
background: rgba(15, 15, 35, 0.8) !important;
backdrop-filter: blur(10px) !important;
}
.gr-input, .gr-textarea {
font-family: 'JetBrains Mono', monospace !important;
background: rgba(30, 30, 60, 0.6) !important;
border: 1px solid rgba(124, 58, 237, 0.4) !important;
border-radius: 8px !important;
color: #e2e8f0 !important;
}
.gr-button-primary {
background: linear-gradient(135deg, #7c3aed 0%, #00d4ff 100%) !important;
border: none !important;
font-weight: 600 !important;
font-size: 1rem !important;
padding: 12px 32px !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
text-transform: uppercase !important;
letter-spacing: 1px !important;
}
.gr-button-primary:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(124, 58, 237, 0.4) !important;
}
.gr-button-secondary {
background: transparent !important;
border: 2px solid rgba(124, 58, 237, 0.5) !important;
color: #a78bfa !important;
font-weight: 500 !important;
border-radius: 8px !important;
}
.section-header {
color: #00d4ff !important;
font-weight: 600 !important;
font-size: 1.2rem !important;
margin-bottom: 1rem !important;
padding-bottom: 0.5rem !important;
border-bottom: 2px solid rgba(0, 212, 255, 0.3) !important;
}
.output-box {
background: rgba(20, 20, 45, 0.9) !important;
border: 1px solid rgba(0, 212, 255, 0.3) !important;
border-radius: 12px !important;
padding: 1rem !important;
}
.gr-markdown {
color: #e2e8f0 !important;
}
.gr-markdown code {
background: rgba(124, 58, 237, 0.2) !important;
padding: 2px 6px !important;
border-radius: 4px !important;
font-family: 'JetBrains Mono', monospace !important;
}
footer {
display: none !important;
}
.gr-accordion {
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 8px !important;
background: rgba(20, 20, 45, 0.6) !important;
}
.gr-check-radio {
accent-color: #7c3aed !important;
}
label {
color: #cbd5e1 !important;
}
"""
# Build the Gradio interface: a two-column layout with the HTML input and
# parser options on the left and the parsed results on the right.
with gr.Blocks(css=custom_css, title="UltraData Math Parser") as demo:
    gr.HTML('<h1 class="main-title">📐 UltraData Math Parser</h1>')
    gr.HTML('<p class="subtitle">Unified HTML Parser for Mathematical Content Extraction</p>')

    with gr.Row():
        # Left column: input HTML, base URL, parser options, action buttons.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">📥 Input</div>')
            html_input = gr.Textbox(
                label="HTML Content",
                placeholder="Paste your HTML content here...",
                lines=15,
                max_lines=30,
                value=EXAMPLE_HTML,
            )
            base_url_input = gr.Textbox(
                label="Base URL (Optional)",
                placeholder="https://example.com/page",
                lines=1,
            )
            with gr.Accordion("⚙️ Advanced Options", open=False):
                html_type = gr.Radio(
                    choices=["unified", "article", "forum"],
                    value="unified",
                    label="Parser Type",
                    info="Select the parsing strategy",
                )
                process_math = gr.Checkbox(
                    label="Process Math Expressions",
                    value=True,
                    info="Convert MathML and LaTeX to unified format",
                )
                include_tables = gr.Checkbox(
                    label="Include Tables",
                    value=True,
                    info="Preserve table elements in output",
                )
                enable_forum = gr.Checkbox(
                    label="Enable Forum Assembly",
                    value=True,
                    info="Assemble forum posts and comments",
                )
            with gr.Row():
                parse_btn = gr.Button("🚀 Parse HTML", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg")

        # Right column: statistics, title, and the three result views.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">📤 Output</div>')
            metadata_output = gr.Markdown(
                label="Parsing Statistics",
                elem_classes=["output-box"],
            )
            title_output = gr.Textbox(
                label="Extracted Title",
                lines=1,
                interactive=False,
            )
            with gr.Tabs():
                with gr.TabItem("📝 Raw HTML"):
                    html_output = gr.Textbox(
                        label="Extracted HTML",
                        lines=12,
                        max_lines=20,
                        interactive=False,
                    )
                with gr.TabItem("📄 Plain Text"):
                    text_output = gr.Textbox(
                        label="Plain Text (w3m rendered)",
                        lines=12,
                        max_lines=20,
                        interactive=False,
                    )
                with gr.TabItem("👁️ Preview"):
                    # NOTE(review): this renders the extracted markup as-is;
                    # confirm that is acceptable for untrusted input pages.
                    preview_output = gr.HTML(
                        label="HTML Preview",
                    )

    # Event handlers.
    # The order of `inputs` must match the parameter order of process_input,
    # and `outputs` must match the 5-tuple returned by format_output.
    parse_btn.click(
        fn=process_input,
        inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[metadata_output, title_output, html_output, text_output, preview_output],
    )

    def clear_all():
        """Return one empty string per component wired to the clear button."""
        return "", "", "", "", "", "", ""

    clear_btn.click(
        fn=clear_all,
        outputs=[html_input, base_url_input, metadata_output, title_output, html_output, text_output, preview_output],
    )

    # Footer info
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; padding: 1rem; color: #64748b; font-size: 0.9rem;">
    <p>🔬 <strong>UltraData Math Parser</strong> - Part of the UltraData-Math Project</p>
    <p>Specialized in extracting mathematical content from web pages with MathML, LaTeX, and formula support.</p>
    </div>
    """)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()