FormatReview / rule_extractor.py
Stephen Zweibel
Update app for Hugging Face
af140e4
raw
history blame
7.35 kB
import logging
import asyncio
import nest_asyncio
import os
import json
import httpx
from config import settings
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
class FormattingRules(BaseModel):
    """Schema for formatting rules extraction"""

    # NOTE: the class docstring above and every ``description`` below are not
    # just documentation. pydantic emits them into the JSON schema, and that
    # schema is handed to the LLM extraction strategy in get_rules_from_url,
    # so editing any of this text changes what the model is asked to produce.

    # Page-margin requirements.
    margins: str = Field(
        description="Margin requirements for the manuscript",
    )
    # Typeface requirements.
    font: str = Field(
        description="Font requirements including size, type, etc.",
    )
    # Line-spacing requirements.
    line_spacing: str = Field(
        description="Line spacing requirements",
    )
    # Citation / reference style.
    citations: str = Field(
        description="Citation style and formatting requirements",
    )
    # Mandatory manuscript sections and how they are organised.
    sections: str = Field(
        description="Required sections and their structure",
    )
    # Catch-all for rules that fit none of the fields above.
    other_rules: str = Field(
        description="Any other formatting requirements",
    )
    # Short overview of the most important rules.
    summary: str = Field(
        description="A brief summary of the key formatting requirements",
    )
def format_rules_for_display(rules_data):
    """
    Format the extracted rules data into a readable markdown string.

    Args:
        rules_data: Mapping-like object (anything exposing a ``get`` method)
            keyed by ``margins``, ``font``, ``line_spacing``, ``citations``,
            ``sections``, ``other_rules`` and ``summary``.

    Returns:
        A markdown document with a title and one ``##`` section per key.
        A key that is missing, ``None`` or a blank string is rendered as
        "Not specified". If ``rules_data`` is empty or is not mapping-like,
        a "could not extract" message is returned instead of raising.
    """
    unavailable = "Could not extract formatting rules from the provided URL."
    placeholder = "Not specified"

    if not rules_data:
        return unavailable

    # Duck-type the argument: a bare string or list (both plausible shapes for
    # LLM output) has no ``get`` and would otherwise raise AttributeError.
    lookup = getattr(rules_data, "get", None)
    if not callable(lookup):
        return unavailable

    # (heading shown to the user, key in rules_data), in display order.
    sections = (
        ("Margins", "margins"),
        ("Font", "font"),
        ("Line Spacing", "line_spacing"),
        ("Citations", "citations"),
        ("Section Structure", "sections"),
        ("Other Requirements", "other_rules"),
        ("Summary", "summary"),
    )

    def _display_value(key):
        value = lookup(key, placeholder)
        # A key that is present but null/blank would otherwise render as the
        # literal text "None" or as an empty section.
        if value is None or (isinstance(value, str) and not value.strip()):
            return placeholder
        return value

    # Leading and trailing empty items reproduce the newline that opens and
    # closes the original triple-quoted template.
    lines = ["", "# Manuscript Formatting Guidelines"]
    for heading, key in sections:
        lines.append(f"## {heading}")
        lines.append(f"{_display_value(key)}")
    lines.append("")
    return "\n".join(lines)
def _select_rules_data(extracted_content, expected_keys):
    """
    Pick the formatting-rules mapping out of crawl4ai's extracted content.

    Args:
        extracted_content: The crawl result's ``extracted_content``. crawl4ai
            documents this as a JSON-encoded string; an already-decoded dict
            or list is accepted as well.
        expected_keys: Field names of the extraction schema. A candidate must
            contain at least one of them to be accepted. If the collection is
            empty, the first dict found is accepted.

    Returns:
        The first matching dict, or ``None`` when the content is not valid
        JSON, is not a dict/list, or holds no dict with any expected key.
    """
    payload = extracted_content
    if isinstance(payload, (str, bytes, bytearray)):
        try:
            payload = json.loads(payload)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None

    if isinstance(payload, dict):
        candidates = [payload]
    elif isinstance(payload, list):
        candidates = [item for item in payload if isinstance(item, dict)]
    else:
        return None

    for candidate in candidates:
        if not expected_keys or any(key in candidate for key in expected_keys):
            return candidate
    return None


def get_rules_from_url(url: str) -> str:
    """
    Extracts formatting rules from a given URL using crawl4ai.

    Args:
        url: Address of the page to crawl.

    Returns:
        The markdown produced by ``format_rules_for_display`` when the LLM
        extraction yields a usable JSON object. Otherwise, in order of
        preference: the raw extracted content, the page's markdown, or a
        human-readable failure message. Exceptions raised while crawling or
        during the robots.txt fallback are logged and reported through the
        returned string.
    """
    logger.info(f"Extracting rules from URL: {url}")

    # Apply nest_asyncio here, when the function is called, so that
    # asyncio.run() below also works if an event loop is already running.
    nest_asyncio.apply()

    # Import crawl4ai modules here to avoid event loop issues at module level
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    async def _extract_rules_async(url: str) -> str:
        """
        Asynchronously extracts formatting rules from a given URL using crawl4ai.
        """
        # Configure the browser
        browser_config = BrowserConfig(verbose=True)

        # Build the schema once: it is sent to the LLM and its property names
        # are later used to recognise a genuine rules object in the response.
        rules_schema = FormattingRules.schema()
        expected_keys = set(rules_schema.get("properties", {}))

        # Configure the LLM extraction
        extraction_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=f"{settings.llm_provider}/{settings.llm_model_name}",
                api_token=settings.openrouter_api_key
            ),
            schema=rules_schema,
            extraction_type="schema",
            instruction="""
From the crawled content, extract all formatting rules for manuscript submissions.
Focus on requirements for margins, font, line spacing, citations, section structure,
and any other formatting guidelines. Provide a comprehensive extraction of all
formatting-related information.
If a specific requirement is not mentioned in the content, include "Not specified" in the corresponding field.
"""
        )

        # Configure the crawler
        run_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True,
            exclude_social_media_links=True,
            check_robots_txt=True,
            semaphore_count=3,
            extraction_strategy=extraction_strategy
        )

        # Initialize the crawler and run. The browser config built above was
        # previously never handed to the crawler; pass it in explicitly.
        async with AsyncWebCrawler(config=browser_config) as crawler:
            try:
                result = await crawler.arun(
                    url=url,
                    config=run_config
                )
                logger.info(f"Crawler result for {url}: {result}")

                # Handle robots.txt blocking
                # NOTE(review): this deliberately fetches a page the site's
                # robots.txt disallows for crawlers - confirm that is intended.
                if not result.success and "robots.txt" in str(result.error_message):
                    logger.warning(f"Crawl blocked by robots.txt for {url}. Falling back to direct download.")
                    try:
                        # Use the async client: a blocking request here would
                        # stall the event loop this coroutine is running on.
                        async with httpx.AsyncClient() as client:
                            response = await client.get(url, follow_redirects=True)
                            response.raise_for_status()
                            raw_html = response.text
                        logger.info(f"Successfully downloaded HTML content for {url}.")

                        # Re-run crawl4ai with raw HTML
                        raw_html_url = f"raw:{raw_html}"
                        result = await crawler.arun(url=raw_html_url, config=run_config)
                        logger.info(f"Crawler result for raw HTML: {result}")
                    except httpx.HTTPStatusError as e:
                        logger.error(f"HTTP error while fetching {url}: {e}", exc_info=True)
                        return "Failed to download the page content after being blocked by robots.txt."
                    except Exception as e:
                        logger.error(f"An error occurred during fallback processing for {url}: {e}", exc_info=True)
                        return "An error occurred during the fallback extraction process."
            except Exception as e:
                logger.error(f"An error occurred during crawling {url}: {e}", exc_info=True)
                return "An error occurred while trying to extract formatting rules."

        if result.success and result.extracted_content:
            # extracted_content arrives as JSON text, so decode it before
            # looking for the rules object; otherwise it is never formatted.
            rules_data = _select_rules_data(result.extracted_content, expected_keys)
            if rules_data is None:
                # Nothing recognisable to format: hand back the raw content,
                # which is what callers received before for string content.
                logger.info(f"Extracted content for {url} is not a recognisable rules object; returning it unformatted")
                return str(result.extracted_content)

            # Store the raw data for debugging
            logger.info(f"Extracted rules data: {json.dumps(rules_data, indent=2, default=str)}")

            # Format the rules for display
            formatted_rules = format_rules_for_display(rules_data)
            logger.info(f"Formatted rules: {formatted_rules[:100]}...")  # Log for debugging
            return formatted_rules

        if result.success and result.markdown:
            # Fallback to markdown if structured extraction fails
            logger.info(f"Extraction failed, falling back to markdown for {url}")
            # str() keeps the declared return type even if markdown is a
            # string-like wrapper object rather than a plain str.
            return str(result.markdown)

        logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
        return "Could not extract formatting rules from the provided URL. The crawler did not return any content."

    # Run the async function using the patched event loop
    return asyncio.run(_extract_rules_async(url))