| """ |
| Main pipeline for Paper2ProjectPage. |
| Integrates all modules to generate project pages from research papers. |
| """ |
|
|
| import argparse |
| import json |
| import os |
| import time |
| from dotenv import load_dotenv |
| from pathlib import Path |
| import shutil |
| from ProjectPageAgent.parse_paper import parse_paper_for_project_page, save_parsed_content |
| from ProjectPageAgent.html_finder import HtmlFinder |
| from ProjectPageAgent.content_planner import ProjectPageContentPlanner |
| from ProjectPageAgent.html_generator import ProjectPageHTMLGenerator,to_url |
| from utils.wei_utils import get_agent_config |
| from ProjectPageAgent.content_planner import filter_references |
| from utils.src.utils import run_sync_screenshots |
|
|
| load_dotenv() |
|
|
def matching(requirement, tags_path='tags.json'):
    """Rank templates by weighted feature agreement with *requirement*.

    Parameters
    ----------
    requirement : dict
        Desired feature -> value mapping, e.g. ``{"background_color": "light", ...}``.
        Features present in a template's tags but missing from *requirement*
        simply score no points (previously this raised ``KeyError``).
    tags_path : str, optional
        Path to the JSON file mapping template names to their feature tags.
        Defaults to ``'tags.json'`` in the current working directory,
        preserving the original behavior.

    Returns
    -------
    list[str]
        Names of the (up to) three best-matching templates, best first.
    """
    # Relative importance of each feature when scoring a template match.
    weight = {
        "background_color": 1.0,
        "has_hero_section": 0.75,
        "Page density": 0.85,
        "image_layout": 0.65,
        "title_color": 0.6,
        "has_navigation": 0.7
    }
    with open(tags_path, 'r', encoding='utf-8') as f:
        template_tags = json.load(f)

    # Accumulate the weight of every feature a template shares with the request.
    points = {}
    for name, tag in template_tags.items():
        for feature, value in tag.items():
            if requirement.get(feature) == value:
                points[name] = points.get(name, 0.0) + weight[feature]

    # Highest score first; return at most the top three template names.
    ranked = sorted(points.items(), key=lambda item: item[1], reverse=True)
    return [name for name, _ in ranked[:3]]
|
|
def copy_static_files(template_file_path, template_root_dir, output_dir, paper_name):
    """Copy a template's static assets into the paper's output directory.

    Replicates the whole template directory under ``output_dir/paper_name``
    (minus the template HTML file itself, which is generated separately)
    and ensures a ``static`` sub-directory exists.

    Parameters:
        template_file_path: Path to the template's HTML entry file.
        template_root_dir: Root directory of the chosen template.
        output_dir: Root output directory for all generated pages.
        paper_name: Name of the paper; used as the per-project sub-directory.

    Returns:
        Path of the ``static`` directory on success, or ``None`` when the
        template HTML file cannot be read.
    """
    print(f"Detecting Static files: {template_file_path}")
    os.makedirs(output_dir, exist_ok=True)

    project_output_dir = os.path.join(output_dir, paper_name)
    os.makedirs(project_output_dir, exist_ok=True)

    static_dir = os.path.join(project_output_dir, 'static')
    os.makedirs(static_dir, exist_ok=True)

    # Location of the HTML file relative to the template root, so its copied
    # duplicate can be removed from the output tree below.
    html_relative_path = os.path.relpath(template_file_path, template_root_dir)

    if os.path.isdir(template_root_dir):
        print(f"Found template dir: {template_root_dir}")
        try:
            shutil.copytree(template_root_dir, project_output_dir, dirs_exist_ok=True)
            # The page itself is generated later; drop the copied template HTML.
            # Guarded so a missing file no longer aborts with a misleading
            # "Failed to copy static files" message.
            copied_html = os.path.join(project_output_dir, html_relative_path)
            if os.path.exists(copied_html):
                os.remove(copied_html)
            print(f"Copied template to: {project_output_dir}")
        except Exception as e:
            print(f"Failed to copy static files: {e}")

    # Sanity check that the template HTML is readable; callers treat a
    # ``None`` return as a failed static-files step.
    try:
        with open(template_file_path, 'r', encoding='utf-8') as f:
            f.read()
    except Exception as e:
        print(f"Failed to read template file: {e}")
        return

    return static_dir
|
|
def main():
    """Main pipeline for generating project pages from research papers.

    End-to-end flow:
      1. Parse the paper PDF into raw content (markdown text, images, tables),
         or reload a cached parse from ``project_contents/``.
      2. Plan and generate the page content with the text LLM agent
         (filter -> sections -> text content -> full content).
      3. Render the content into the chosen HTML template, optionally fix
         tables, and iterate on human feedback until the reviewer approves.
      4. Write a JSON generation log (token usage, timing, output paths).

    The ``--resume`` flag names the first step to actually execute; earlier
    steps reload their cached results from disk instead of re-running.
    """
    parser = argparse.ArgumentParser(description='Paper2ProjectPage Generation Pipeline')
    parser.add_argument('--paper_path', type=str, required=True, help='Path to the research paper PDF')
    parser.add_argument('--model_name_t', type=str, default='4o', help='Text model name')
    parser.add_argument('--model_name_v', type=str, default='4o', help='Vision model name')
    parser.add_argument('--template_root', type=str, default="project_templates", help='Directory containing all templates')
    parser.add_argument('--template_dir', type=str, help='Directory of chosen template')
    parser.add_argument('--template_file', type=str, help='Path to a specific template file to use')
    parser.add_argument('--output_dir', type=str, default='generated_project_pages', help='Output directory for generated pages')
    parser.add_argument('--style_preference', type=str, default=None, help='Path to style preference JSON file')
    parser.add_argument('--tmp_dir', type=str, default='tmp', help='Temporary directory')
    # NOTE(review): help text looks copy-pasted from --tmp_dir; this option is
    # the number of full-content check passes. The string default '0' is still
    # converted through type=int by argparse, so it behaves as integer 0.
    parser.add_argument('--full_content_check_times', type=int, default='0', help='Temporary directory')
    # The six style options below feed the template-matching step when no
    # --template_dir is supplied.
    parser.add_argument('--background_color', type=str, choices=['light', 'dark'], required=True,
                        help='Background color of generated project page')
    parser.add_argument('--has_navigation', type=str, choices=['yes', 'no'], required=True,
                        help='Is the generated project page has navigation')
    parser.add_argument('--has_hero_section', type=str, choices=['yes', 'no'], required=True,
                        help='Is the generated project page has hero section')
    parser.add_argument('--title_color', type=str, choices=['pure', 'colorful'], required=True,
                        help="Is the title's color of the project page is pure or colorful")
    parser.add_argument('--page_density', type=str, choices=['spacious', 'compact'], required=True,
                        help="The overall spacing tightnessβamount of white space vs. information density")
    parser.add_argument('--image_layout', type=str, choices=['rotation', 'parallelism'], required=True,
                        help="The dominant arrangement style for images.")
    # NOTE(review): help text again says 'Temporary directory'; this is the
    # number of HTML check passes.
    parser.add_argument('--html_check_times', type=int, default='1', help='Temporary directory')
    parser.add_argument(
        '--resume',
        type=str,
        choices=['parse_pdf', 'generate_content','full_content_check', 'generate_html', 'html_check','modify_table','html_feedback'],
        default='parse_pdf',
        help="From which step to resume: 'parse_pdf', 'generate_content','full_content_check', 'generate_html', 'html_check','modify_table','html_feedback'",
    )
    parser.add_argument('--human_input', type=str, default='1',choices=['0','1'] ,help='Human input for feedback')

    args = parser.parse_args()

    # No template chosen explicitly: rank templates against the six style
    # requirements and let the user pick one of the top three by name.
    if not args.template_dir:
        template_requirement = {
            "background_color": args.background_color,
            "has_hero_section": args.has_hero_section,
            "Page density": args.page_density,
            "image_layout": args.image_layout,
            "has_navigation": args.has_navigation,
            "title_color": args.title_color
        }
        matched_template = matching(template_requirement)
        print('Below is names of the most matching 3 templates:')
        print(' '.join(matched_template))
        template_name = input('Please choose one from them, you can just input the name of your favorite template')
        while template_name not in matched_template:
            template_name = input('Please input the correct name of your favorite template!!')
        args.template_dir = os.path.join(args.template_root, template_name)

    # Locate the template's HTML entry file when not given on the CLI.
    if not args.template_file:
        html_finder_ = HtmlFinder()
        args.template_file = html_finder_.find_html(args.template_dir)

    # Derive the paper name from the PDF filename.
    # NOTE(review): splits on '/' only, so Windows-style paths keep their
    # directory prefix — confirm callers always pass POSIX-style paths.
    paper_name = args.paper_path.split('/')[-1].replace('.pdf', '') if '/' in args.paper_path else args.paper_path.replace('.pdf', '')
    args.paper_name = paper_name

    print(f"Starting Paper2ProjectPage generation for: {paper_name}")
    print(f"Paper path: {args.paper_path}")
    print(f"Models: {args.model_name_t} (text), {args.model_name_v} (vision)")

    start_time = time.time()
    # Separate token tallies for the text and vision agents; the vision
    # counters are never incremented in this function (always logged as 0).
    total_input_tokens_t = 0
    total_output_tokens_t = 0
    total_input_tokens_v = 0
    total_output_tokens_v = 0

    os.makedirs(args.tmp_dir, exist_ok=True)

    try:
        agent_config_t = get_agent_config(args.model_name_t)
        agent_config_v = get_agent_config(args.model_name_v)

        print("\n" + "="*50)
        print("STEP 1: Parsing Research Paper")
        print("="*50)

        # Parse the PDF only when no cached raw content exists on disk.
        raw_content_path = f'project_contents/{args.paper_name}_raw_content.json'
        if not os.path.exists(raw_content_path):
            print(f"Raw content does not exist at {raw_content_path}")

            input_token, output_token, raw_result, images, tables = parse_paper_for_project_page(args, agent_config_t)
            total_input_tokens_t += input_token
            total_output_tokens_t += output_token

            # Persist the parse and reload it so both branches end with the
            # same on-disk JSON structure in paper_content.
            raw_content_path, token_log_path = save_parsed_content(args, raw_result, images, tables, input_token, output_token)

            with open(raw_content_path, 'r') as f:
                paper_content = json.load(f)
        else:
            print(f"Loading existing raw content from {raw_content_path}")
            with open(raw_content_path, 'r') as f:
                paper_content = json.load(f)

            images = paper_content.get('images', [])
            tables = paper_content.get('tables', [])
            token_log_path = raw_content_path.replace('_raw_content.json', '_parse_log.json')

        # NOTE(review): images/tables are re-extracted here for both branches,
        # duplicating the assignments in the else-branch just above.
        images = paper_content.get('images', [])
        tables = paper_content.get('tables', [])
        figures = {
            'images': images,
            'tables': tables
        }
        # From here on, paper_content is the paper's markdown text only.
        paper_content = paper_content.get('markdown_content', "")

        print("\n" + "="*50)
        print("STEP 2: Generate project page content")
        print("="*50)

        planner = ProjectPageContentPlanner(agent_config_t, args)
        # Cached intermediate artifacts produced by earlier pipeline runs;
        # which ones are loaded vs regenerated depends on --resume.
        figures_path = f'project_contents/{args.paper_name}_generated_filtered_figures.json'
        generated_section_path = f'project_contents/{args.paper_name}_generated_section.json'
        text_page_content_path = f'project_contents/{args.paper_name}_generated_text_content.json'
        generated_content_path = f'project_contents/{args.paper_name}_generated_full_content.json'
        if args.resume in ['parse_pdf','generate_content','full_content_check']:

            if args.resume != 'full_content_check':

                # Full generation path: filter raw content, then derive the
                # section plan, then the per-section text content.
                paper_content, figures, input_token, output_token = planner.filter_raw_content(paper_content, figures)
                total_input_tokens_t += input_token
                total_output_tokens_t += output_token

                generated_section, input_token, output_token = planner.section_generation(paper_content, figures)
                total_input_tokens_t += input_token
                total_output_tokens_t += output_token

                text_page_content, input_token, output_token = planner.text_content_generation(paper_content, figures, generated_section)
                total_input_tokens_t += input_token
                total_output_tokens_t += output_token

            else :
                # Resuming at full_content_check: reuse the cached filter,
                # section, and text-content results.
                print("Skipping content generation: filter_raw_content, section_generation, text_content_generation")
                print("Loading existing content from previous steps.")
                paper_content = filter_references(paper_content)
                with open(figures_path, 'r') as f:
                    figures = json.load(f)
                with open(generated_section_path, 'r') as f:
                    generated_section = json.load(f)
                with open(text_page_content_path, 'r') as f:
                    text_page_content = json.load(f)

            # Merge everything into the final structured page content.
            generated_content, input_token, output_token = planner.full_content_generation(args, paper_content, figures, generated_section, text_page_content)
            total_input_tokens_t += input_token
            total_output_tokens_t += output_token

            print("\n" + "="*50)
            print("STEP 2.5: Copying Static Files")
            print("="*50)
            static_dir = copy_static_files(args.template_file, args.template_dir, args.output_dir, args.paper_name)

        else:
            # Resuming at an HTML step: all content artifacts come from cache.
            print("Page content is already generated, loading existing content.")

            paper_content = filter_references(paper_content)
            with open(generated_section_path, 'r') as f:
                generated_section = json.load(f)
            with open(text_page_content_path, 'r') as f:
                text_page_content = json.load(f)
            with open(generated_content_path, 'r') as f:
                generated_content = json.load(f)

            static_dir = copy_static_files(args.template_file, args.template_dir, args.output_dir, args.paper_name)

        print("\n" + "="*50)
        print("STEP 3: Generating HTML Project Page")
        print("="*50)
        # Keep the generated page at the same relative location as the
        # template's HTML entry file (POSIX-style '/' path handling).
        html_relative_path = os.path.relpath(args.template_file, args.template_dir)
        html_dir = '/'.join(html_relative_path.strip().split('/')[:-1])
        html_generator = ProjectPageHTMLGenerator(agent_config_t,args)
        with open(args.template_file, 'r', encoding='utf-8') as file:
            html_template = file.read()

        if args.resume != 'modify_table' and args.resume != 'html_feedback':

            assets_dir = html_generator.create_assets_directory(args, html_dir, args.output_dir)

            html_content, input_token, output_token = html_generator.generate_complete_html(
                args, generated_content, html_dir, html_template
            )
            total_input_tokens_t += input_token
            total_output_tokens_t += output_token

            # Snapshot the page before table modification, plus a rendered
            # screenshot for review.
            html_file_path = os.path.join(args.output_dir, args.paper_name, html_dir, 'index_no_modify_table.html')
            with open(html_file_path,'w') as file:
                file.write(html_content)
            run_sync_screenshots(to_url(html_file_path), os.path.join(args.output_dir,args.paper_name, html_dir,'page_final_no_modify_table.png'))

        else:
            # Resuming at modify_table / html_feedback: reload the pre-table
            # snapshot written by an earlier run.
            print(f"skip generate_html and html_check, load html from {os.path.join(args.output_dir,args.paper_name, html_dir,'index.html')}")
            assets_dir = os.path.join(args.output_dir, args.paper_name, html_dir,'assets')
            with open(os.path.join(args.output_dir,args.paper_name, html_dir,'index_no_modify_table.html'),'r') as file:
                html_content = file.read()

        if args.resume != 'html_feedback':
            # Apply table-specific fixes and snapshot the result.
            html_content ,input_token,output_token = html_generator.modify_html_table(html_content,html_dir)
            total_input_tokens_t += input_token
            total_output_tokens_t += output_token
            html_file_path = os.path.join(args.output_dir, args.paper_name, html_dir, 'index_modify_table.html')
            with open(html_file_path,'w') as file:
                file.write(html_content)

        else:
            print("skipping modify_table,go to html_feedback")
            html_file_path = os.path.join(args.output_dir, args.paper_name, html_dir, 'index_modify_table.html')
            with open(html_file_path,'r') as file:
                html_content = file.read()

        print('-'*50)
        run_sync_screenshots(to_url(html_file_path), os.path.join(args.output_dir, args.paper_name, html_dir,'page_final.png'))
        # Human-in-the-loop review: keep revising the page from free-form
        # feedback until the reviewer types 'yes' (case-insensitive).
        if args.human_input == '1':
            human_feedback = input('Please view the final html in index.html,and image in page_final.png,If there are no problems, enter yes and press Enter.\n If there are any problems, please give me feedback directly.\n')
            while human_feedback.lower() != 'yes':

                html_content ,input_token,output_token = html_generator.modify_html_from_human_feedback(html_content,human_feedback)
                total_input_tokens_t += input_token
                total_output_tokens_t += output_token
                with open(os.path.join(args.output_dir, args.paper_name, html_dir, 'index.html'),'w') as file:
                    file.write(html_content)
                run_sync_screenshots(to_url(os.path.join(args.output_dir, args.paper_name, html_dir, 'index.html')), os.path.join(args.output_dir, args.paper_name, html_dir,'page_final.png'))
                print('-'*50)
                human_feedback = input('Please view the final html in index.html,and image in page_final.png,If there are no problems, enter yes and press Enter. \n If there are any problems, please give me feedback directly.\n')

        # Persist the final approved HTML.
        html_file_path = html_generator.save_html_file(html_content, args, html_dir,args.output_dir)

        metadata = html_generator.generate_metadata(generated_content, args)
        metadata_path = html_generator.save_metadata(metadata, args, args.output_dir)

        print("\n" + "="*50)
        print("STEP 4: Finalizing Generation")
        print("="*50)

        end_time = time.time()
        time_taken = end_time - start_time

        # Summary log written alongside the generated page.
        log_data = {
            'paper_name': paper_name,
            'paper_path': args.paper_path,
            'models': {
                'text_model': args.model_name_t,
                'vision_model': args.model_name_v
            },
            'token_usage': {
                'text_input_tokens': total_input_tokens_t,
                'text_output_tokens': total_output_tokens_t,
                'vision_input_tokens': total_input_tokens_v,
                'vision_output_tokens': total_output_tokens_v
            },
            'generation_time': time_taken,
            'output_files': {
                'html_file': html_file_path,
                'assets_dir': assets_dir,
                'static_dir': static_dir,
                'metadata_file': metadata_path
            },
            'content_files': {
                'raw_content': raw_content_path,
                'token_log': token_log_path
            }
        }

        log_path = f"{args.output_dir}/{args.paper_name}/generation_log.json"
        with open(log_path, 'w') as f:
            json.dump(log_data, f, indent=4)

        print(f"\nβ Paper2ProjectPage generation completed successfully!")
        print(f"π Output directory: {args.output_dir}/{args.paper_name}")
        print(f"π HTML file: {html_file_path}")
        print(f"π Assets directory: {assets_dir}")
        print(f"π¨ Static directory: {static_dir}")
        print(f"π Metadata file: {metadata_path}")
        print(f"β±οΈ Total time: {time_taken:.2f} seconds")
        print(f"π’ Token usage - Text: {total_input_tokens_t}β{total_output_tokens_t}, Vision: {total_input_tokens_v}β{total_output_tokens_v}")

    except Exception as e:
        # Surface the failure in the console, then re-raise so callers /
        # CI see a non-zero exit.
        print(f"\nβ Error during generation: {str(e)}")
        raise
|
|
if __name__ == '__main__':
    # Script entry point: run the full pipeline when invoked directly.
    main()