Spaces:
Sleeping
Sleeping
| import logging | |
| import asyncio | |
| import nest_asyncio | |
| import os | |
| import json | |
| import httpx | |
| from config import settings | |
| from pydantic import BaseModel, Field | |
| logger = logging.getLogger(__name__) | |
class FormattingRules(BaseModel):
    """Structured schema describing a journal's manuscript formatting rules.

    Every field is a required free-text string. The field descriptions are
    part of the JSON schema produced from this model, so they are kept
    verbatim.
    """

    # Page layout.
    margins: str = Field(
        ...,
        description="Margin requirements for the manuscript",
    )
    font: str = Field(
        ...,
        description="Font requirements including size, type, etc.",
    )
    line_spacing: str = Field(
        ...,
        description="Line spacing requirements",
    )

    # Content conventions.
    citations: str = Field(
        ...,
        description="Citation style and formatting requirements",
    )
    sections: str = Field(
        ...,
        description="Required sections and their structure",
    )

    # Catch-all and overview.
    other_rules: str = Field(
        ...,
        description="Any other formatting requirements",
    )
    summary: str = Field(
        ...,
        description="A brief summary of the key formatting requirements",
    )
def format_rules_for_display(rules_data):
    """
    Format the extracted rules data into a readable markdown string.

    Args:
        rules_data: A mapping-like object (anything exposing ``get``) keyed by
            ``margins``, ``font``, ``line_spacing``, ``citations``,
            ``sections``, ``other_rules`` and ``summary``.

    Returns:
        A markdown document with a title line followed by one ``##`` section
        per rule category. A category whose value is missing, ``None`` or
        blank is rendered as ``Not specified``. If ``rules_data`` is empty or
        is not mapping-like, a fixed "could not extract" message is returned
        instead.
    """
    fallback_message = "Could not extract formatting rules from the provided URL."

    # Duck-type on ``get`` so dict-like inputs keep working, while strings,
    # lists and other unexpected payloads degrade to the fallback message
    # instead of raising AttributeError.
    lookup = getattr(rules_data, "get", None)
    if not rules_data or not callable(lookup):
        return fallback_message

    # (heading, key) pairs in display order.
    sections = (
        ("Margins", "margins"),
        ("Font", "font"),
        ("Line Spacing", "line_spacing"),
        ("Citations", "citations"),
        ("Section Structure", "sections"),
        ("Other Requirements", "other_rules"),
        ("Summary", "summary"),
    )

    def _text_for(key):
        # ``get(key, default)`` only applies the default when the key is
        # absent; an explicit null or blank value must be normalised too.
        value = lookup(key)
        if value is None or (isinstance(value, str) and not value.strip()):
            return "Not specified"
        return value

    body = "".join(
        f"## {heading}\n{_text_for(key)}\n" for heading, key in sections
    )
    return f"\n# Manuscript Formatting Guidelines\n{body}"
def _select_rules_payload(extracted_content):
    """Return the first usable rules dict from crawl4ai output, or ``None``.

    Accepts a JSON string (decoded here), an already-decoded dict, or a list
    of blocks. Returns ``None`` when the content is not valid JSON or holds
    no usable dict.
    """
    payload = extracted_content
    if isinstance(payload, (str, bytes, bytearray)):
        try:
            payload = json.loads(payload)
        except ValueError:
            # Covers json.JSONDecodeError and bad byte encodings.
            return None
    if isinstance(payload, dict):
        return payload
    if isinstance(payload, list):
        for block in payload:
            # NOTE(review): failed LLM chunks appear to be reported as blocks
            # with a truthy "error" key - confirm against the crawl4ai version
            # in use.
            if isinstance(block, dict) and not block.get("error"):
                return block
    return None


def get_rules_from_url(url: str) -> str:
    """
    Extracts formatting rules from a given URL using crawl4ai.

    The page is crawled with an LLM extraction strategy driven by the
    ``FormattingRules`` schema. If the crawl is refused because of
    robots.txt, the page is downloaded directly with httpx and the raw HTML
    is fed back through the same extraction.

    Args:
        url: Address of the page that contains the formatting guidelines.

    Returns:
        Markdown produced by ``format_rules_for_display`` when structured
        extraction succeeds; otherwise the page markdown (or raw extracted
        text) as a fallback; otherwise a human-readable error message. Crawl
        and download errors are logged and reported as messages rather than
        raised.
    """
    logger.info("Extracting rules from URL: %s", url)
    # Apply nest_asyncio here, when the function is called, so the
    # asyncio.run() at the bottom runs on the patched event loop.
    nest_asyncio.apply()
    # Import crawl4ai modules here to avoid event loop issues at module level
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    async def _extract_rules_async(url: str) -> str:
        """
        Asynchronously extracts formatting rules from a given URL using crawl4ai.
        """
        # Configure the browser
        browser_config = BrowserConfig(verbose=True)

        # Configure the LLM extraction
        instruction = (
            "\n"
            "From the crawled content, extract all formatting rules for manuscript submissions.\n"
            "Focus on requirements for margins, font, line spacing, citations, section structure,\n"
            "and any other formatting guidelines. Provide a comprehensive extraction of all\n"
            "formatting-related information.\n"
            "If a specific requirement is not mentioned in the content, include \"Not specified\" in the corresponding field.\n"
        )
        extraction_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=f"{settings.llm_provider}/{settings.llm_model_name}",
                api_token=settings.openrouter_api_key,
            ),
            schema=FormattingRules.schema(),
            extraction_type="schema",
            instruction=instruction,
        )

        # Configure the crawler
        run_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            process_iframes=True,
            remove_overlay_elements=True,
            exclude_social_media_links=True,
            check_robots_txt=True,
            semaphore_count=3,
            extraction_strategy=extraction_strategy,
        )

        # Initialize the crawler (with the browser settings built above) and run
        async with AsyncWebCrawler(config=browser_config) as crawler:
            try:
                result = await crawler.arun(url=url, config=run_config)
                # Log only the outcome; the full result can hold the whole page.
                logger.info("Crawler finished for %s (success=%s)", url, result.success)

                # Handle robots.txt blocking
                # NOTE(review): this fallback fetches the page even though
                # robots.txt disallowed crawling - confirm this is intended.
                if not result.success and "robots.txt" in str(result.error_message):
                    logger.warning(
                        "Crawl blocked by robots.txt for %s. Falling back to direct download.",
                        url,
                    )
                    try:
                        # Async client so the download does not block the event loop.
                        async with httpx.AsyncClient() as client:
                            response = await client.get(url, follow_redirects=True)
                            response.raise_for_status()
                            raw_html = response.text
                        logger.info("Successfully downloaded HTML content for %s.", url)

                        # Re-run crawl4ai with raw HTML
                        result = await crawler.arun(url=f"raw:{raw_html}", config=run_config)
                        logger.info(
                            "Crawler finished for raw HTML of %s (success=%s)",
                            url,
                            result.success,
                        )
                    except httpx.HTTPStatusError as e:
                        logger.error("HTTP error while fetching %s: %s", url, e, exc_info=True)
                        return "Failed to download the page content after being blocked by robots.txt."
                    except Exception as e:
                        logger.error(
                            "An error occurred during fallback processing for %s: %s",
                            url,
                            e,
                            exc_info=True,
                        )
                        return "An error occurred during the fallback extraction process."
            except Exception as e:
                logger.error("An error occurred during crawling %s: %s", url, e, exc_info=True)
                return "An error occurred while trying to extract formatting rules."

            if result.success and result.extracted_content:
                # extracted_content is normally a JSON string, so it must be
                # decoded before it can be treated as a list or dict.
                rules_data = _select_rules_payload(result.extracted_content)
                if rules_data is None:
                    # Nothing structured to format: prefer the page markdown,
                    # else hand back the raw extracted text.
                    logger.info("No structured rules found, falling back to raw content for %s", url)
                    return str(result.markdown or result.extracted_content)

                # Store the raw data for debugging
                logger.info("Extracted rules data: %s", json.dumps(rules_data, indent=2, default=str))

                # Format the rules for display
                formatted_rules = format_rules_for_display(rules_data)
                logger.info("Formatted rules: %s...", formatted_rules[:100])  # Log for debugging
                return formatted_rules

            if result.success and result.markdown:
                # Fallback to markdown if structured extraction fails
                logger.info("Extraction failed, falling back to markdown for %s", url)
                return str(result.markdown)

            logger.warning(
                "Failed to extract rules or markdown for %s. Crawler success: %s",
                url,
                result.success,
            )
            return "Could not extract formatting rules from the provided URL. The crawler did not return any content."

    # Run the async function using the patched event loop
    return asyncio.run(_extract_rules_async(url))