| #!/usr/bin/env node |
|
|
import { execFileSync, execSync } from 'child_process';
import {
  existsSync,
  mkdirSync,
  readFileSync,
  readdirSync,
  rmSync,
  writeFileSync,
} from 'fs';
import { basename, dirname, join } from 'path';
import { fileURLToPath } from 'url';
import { cleanBibliography } from './bib-cleaner.mjs';
import { postProcessMarkdown } from './post-processor.mjs';
import { preprocessLatexReferences } from './reference-preprocessor.mjs';
|
|
// ES modules do not define __filename/__dirname, so derive them from the
// module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Builds a path relative to the directory containing this script.
const fromScriptDir = (...segments) => join(__dirname, ...segments);

// Defaults used when --input / --output are not given on the command line.
const DEFAULT_INPUT = fromScriptDir('input', 'main.tex');
const DEFAULT_OUTPUT = fromScriptDir('output');
|
|
/**
 * Build the converter configuration from command-line arguments.
 *
 * Recognised options are `--input=PATH`, `--output=PATH` and `--clean`;
 * any other argument is ignored. Later occurrences of an option override
 * earlier ones.
 *
 * @param {string[]} [argv=process.argv.slice(2)] Arguments to parse.
 * @returns {{input: string, output: string, clean: boolean}} Configuration,
 *   pre-filled with DEFAULT_INPUT / DEFAULT_OUTPUT and `clean: false`.
 */
function parseArgs(argv = process.argv.slice(2)) {
  const INPUT_PREFIX = '--input=';
  const OUTPUT_PREFIX = '--output=';

  const config = {
    input: DEFAULT_INPUT,
    output: DEFAULT_OUTPUT,
    clean: false,
  };

  for (const arg of argv) {
    if (arg.startsWith(INPUT_PREFIX)) {
      // Slice after the prefix rather than splitting on '=', so a path that
      // itself contains '=' is not truncated at its first '='.
      config.input = arg.slice(INPUT_PREFIX.length);
    } else if (arg.startsWith(OUTPUT_PREFIX)) {
      config.output = arg.slice(OUTPUT_PREFIX.length);
    } else if (arg === '--clean') {
      config.clean = true;
    }
  }

  return config;
}
|
|
/**
 * Create `dir`, including any missing parent directories, unless something
 * already exists at that path (in which case nothing is done).
 *
 * @param {string} dir Directory path to create.
 */
function ensureDirectory(dir) {
  const alreadyPresent = existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  mkdirSync(dir, { recursive: true });
}
|
|
/**
 * Remove the contents of `dir` while keeping the directory itself.
 *
 * Does nothing when `dir` does not exist. Entries whose name starts with a
 * dot are kept, matching the `rm -rf "dir"/*` shell glob this replaces.
 * Using the fs API instead of a shell command means the path is never
 * interpreted by a shell (quotes, `$`, backticks in the path are harmless)
 * and the function does not depend on a POSIX `rm`.
 *
 * @param {string} dir Directory whose visible entries are deleted.
 */
function cleanDirectory(dir) {
  if (!existsSync(dir)) {
    return;
  }

  for (const entry of readdirSync(dir)) {
    if (entry.startsWith('.')) {
      continue;
    }
    rmSync(join(dir, entry), { recursive: true, force: true });
  }
}
|
|
/**
 * Rewrite LaTeX constructs that are cleaned up before the Pandoc step.
 *
 * Applied in order:
 *  1. `$p_0$` becomes plain `p0`.
 *  2. `equation*` delimiters (optionally wrapped in a stray `$$`) become `$$`.
 *  3. `&=` becomes `=` and every other `&` is dropped, except inside
 *     `\begin{align}…\end{align}` blocks, which are left untouched.
 *  4. `\cite{a,b}` / `\citet{…}` / `\citep{…}` become `@a, @b`.
 *  5. `\textsc{…}` becomes `\text{…}`, with any `\(…\)` inside replaced by
 *     `MATHEXPR`, unless it appears within 50 characters after
 *     `\newcommand`, `\renewcommand` or `\def`.
 *  6. `\input{snippets/…}` becomes a comment.
 *
 * @param {string} text LaTeX source.
 * @param {{guardP0?: boolean}} [options] When `guardP0` is true, `$p_0$`
 *   immediately followed by an ASCII letter is left alone. The main file was
 *   historically processed with this guard and included files without it;
 *   that difference is preserved here. NOTE(review): confirm it is intended.
 * @returns {string} Cleaned LaTeX source.
 */
function cleanLatexConstructs(text, { guardP0 = false } = {}) {
  let cleaned = text.replace(guardP0 ? /\$p_0\$(?![A-Za-z])/g : /\$p_0\$/g, 'p0');

  // A replacement *string* of '$$' is an escape that inserts a single '$',
  // so a function replacer is used to emit the two-dollar display delimiter.
  const displayMath = () => '$$';
  cleaned = cleaned
    .replace(/\$\$\\begin\{equation\*\}/g, displayMath)
    .replace(/\\end\{equation\*\}\$\$/g, displayMath)
    .replace(/\\begin\{equation\*\}/g, displayMath)
    .replace(/\\end\{equation\*\}/g, displayMath);

  // Single pass: whole align blocks are matched first and returned verbatim,
  // so only ampersands outside them are rewritten.
  cleaned = cleaned.replace(/\\begin\{align\}[\s\S]*?\\end\{align\}|&=?/g, (token) => {
    if (token.startsWith('\\begin')) {
      return token;
    }
    return token === '&=' ? '=' : '';
  });

  cleaned = cleaned.replace(/\\cite[tp]?\{([^}]+)\}/g, (_match, citations) =>
    citations
      .split(',')
      .map((cite) => `@${cite.trim()}`)
      .join(', '));

  cleaned = cleaned.replace(
    /\\textsc\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g,
    (match, inner, offset, whole) => {
      // Leave \textsc alone when it looks like part of a macro definition.
      const before = whole.substring(Math.max(0, offset - 50), offset);
      if (
        before.includes('\\newcommand') ||
        before.includes('\\renewcommand') ||
        before.includes('\\def')
      ) {
        return match;
      }
      const simplified = inner.replace(/\\\([^)]+\\\)/g, 'MATHEXPR');
      return `\\text{${simplified}}`;
    },
  );

  return cleaned.replace(/\\input\{snippets\/[^}]+\}/g, '% Code snippet removed');
}

/**
 * Produce the text that replaces a single `\input{inputPath}` command.
 *
 * @param {string} inputPath Path exactly as written inside `\input{…}`.
 * @param {string} inputDir Directory that input paths are resolved against.
 * @param {string[]} activeFiles Files currently being expanded (outermost
 *   first); used to stop circular includes.
 * @returns {string} Included (cleaned, recursively expanded) content, or a
 *   LaTeX comment explaining why nothing was included.
 */
function expandLatexInput(inputPath, inputDir, activeFiles) {
  if (inputPath.includes('snippets/')) {
    console.log(` Skipping: ${inputPath}`);
    return `% Skipped: ${inputPath}`;
  }

  const fileName = inputPath.endsWith('.tex') ? inputPath : `${inputPath}.tex`;
  const fullPath = join(inputDir, fileName);

  if (!existsSync(fullPath)) {
    console.log(` β οΈ File not found: ${fullPath} (skipping)`);
    return `% File not found: ${inputPath}`;
  }

  if (activeFiles.includes(fullPath)) {
    // A file that (directly or indirectly) inputs itself would never finish.
    console.log(` Circular \\input skipped: ${inputPath}`);
    return `% Circular input skipped: ${inputPath}`;
  }

  console.log(` Including: ${inputPath}`);
  const included = cleanLatexConstructs(readFileSync(fullPath, 'utf8'));
  return resolveLatexInputs(included, inputDir, [...activeFiles, fullPath]);
}

/**
 * Inline every live `\input{…}` command in `text`.
 *
 * A command is left untouched when a `%` appears before it on the same line
 * (it is commented out). Each replacement is spliced in at the position of
 * the command that was actually matched, and is inserted literally: a
 * function replacer is used so `$$`, `$&` etc. in the included LaTeX are not
 * interpreted as replacement patterns.
 *
 * @param {string} text LaTeX source to scan.
 * @param {string} inputDir Directory that input paths are resolved against.
 * @param {string[]} activeFiles Files currently being expanded.
 * @returns {string} Source with inputs expanded.
 */
function resolveLatexInputs(text, inputDir, activeFiles) {
  return text.replace(/\\input\{([^}]+)\}/g, (command, inputPath, offset) => {
    const lineStart =
      Math.max(text.lastIndexOf('\n', offset - 1), text.lastIndexOf('\r', offset - 1)) + 1;
    if (text.slice(lineStart, offset).includes('%')) {
      return command;
    }
    return expandLatexInput(inputPath, inputDir, activeFiles);
  });
}

/**
 * Flatten and clean a LaTeX document into a single temporary file.
 *
 * Reads `inputFile`, cleans it, inlines `\input{…}` files (resolved against
 * the directory of `inputFile`, with `.tex` appended when missing), passes
 * the result through `preprocessLatexReferences`, and writes it to
 * `<outputDir>/temp_main.tex`.
 *
 * @param {string} inputFile Path of the main `.tex` file.
 * @param {string} outputDir Existing directory that receives the temp file.
 * @returns {string} Path of the temporary file that was written.
 */
function preprocessLatexFile(inputFile, outputDir) {
  const inputDir = dirname(inputFile);
  const tempFile = join(outputDir, 'temp_main.tex');

  console.log('π Preprocessing LaTeX file to resolve \\input commands...');
  const source = readFileSync(inputFile, 'utf8');

  console.log('π§Ή Cleaning problematic LaTeX constructs...');
  const cleaned = cleanLatexConstructs(source, { guardP0: true });

  // Seed the cycle guard with the main file so it cannot input itself.
  const mainPath = join(inputDir, basename(inputFile));
  const merged = resolveLatexInputs(cleaned, inputDir, [mainPath]);

  console.log('π§ Preprocessing LaTeX references for MDX compatibility...');
  const { content } = preprocessLatexReferences(merged);

  writeFileSync(tempFile, content);
  return tempFile;
}
|
|
/**
 * Clean the `main.bib` that sits next to the input file, if there is one.
 *
 * @param {string} inputFile Path of the main `.tex` file.
 * @param {string} outputDir Directory that receives the cleaned `main.bib`.
 * @returns {string|null} Path of the cleaned bibliography, or `null` when no
 *   `main.bib` exists or `cleanBibliography` returns a falsy result.
 */
function processBibliography(inputFile, outputDir) {
  const sourceBib = join(dirname(inputFile), 'main.bib');
  const cleanedBib = join(outputDir, 'main.bib');

  const hasBibliography = existsSync(sourceBib);
  if (!hasBibliography) {
    console.log(' β οΈ No bibliography file found');
    return null;
  }

  if (cleanBibliography(sourceBib, cleanedBib)) {
    return cleanedBib;
  }
  return null;
}
|
|
/**
 * Convert a LaTeX document to Markdown with Pandoc.
 *
 * Steps: verify the input file and Pandoc are available, clean the
 * bibliography, write a preprocessed temporary `.tex` file, run Pandoc
 * (GFM output, media extracted to `<outputDir>/assets/image`, equation-ids
 * Lua filter), delete the temporary file, then run `postProcessMarkdown`
 * over the generated Markdown and write it back.
 *
 * Pandoc is started with an argument array rather than a shell command
 * string, so paths containing spaces, quotes or `$` are passed through
 * verbatim instead of being interpreted by a shell.
 *
 * The process exits with status 1 when the input file is missing, Pandoc is
 * not installed, or the conversion / post-processing step throws.
 *
 * @param {string} inputFile Path of the main `.tex` file.
 * @param {string} outputDir Directory for the Markdown and extracted media;
 *   created if missing.
 */
export function convertLatexToMarkdown(inputFile, outputDir) {
  console.log('π Simple LaTeX to Markdown Converter');
  console.log(`π Input: ${inputFile}`);
  console.log(`π Output: ${outputDir}`);

  if (!existsSync(inputFile)) {
    console.error(`β Input file not found: ${inputFile}`);
    process.exit(1);
  }

  ensureDirectory(outputDir);

  try {
    execFileSync('pandoc', ['--version'], { stdio: 'pipe' });
  } catch {
    console.error('β Pandoc not found. Please install it: brew install pandoc');
    process.exit(1);
  }

  const cleanBibFile = processBibliography(inputFile, outputDir);
  const preprocessedFile = preprocessLatexFile(inputFile, outputDir);

  const inputFileName = basename(inputFile, '.tex');
  const outputFile = join(outputDir, `${inputFileName}.md`);

  // `force` makes removal a no-op when the temp file is already gone.
  const removePreprocessedFile = () => rmSync(preprocessedFile, { force: true });

  try {
    console.log('π Converting with Pandoc...');

    const mediaDir = join(outputDir, 'assets', 'image');
    ensureDirectory(mediaDir);
    const inputDir = dirname(inputFile);
    const equationFilterPath = join(__dirname, 'filters', 'equation-ids.lua');

    const pandocArgs = [
      preprocessedFile,
      '-f', 'latex+latex_macros',
      '-t', 'gfm+tex_math_dollars+raw_html',
      '--shift-heading-level-by=1',
      '--wrap=none',
      ...(cleanBibFile ? [`--bibliography=${cleanBibFile}`] : []),
      `--extract-media=${mediaDir}`,
      `--resource-path=${inputDir}`,
      `--lua-filter=${equationFilterPath}`,
      '-o', outputFile,
    ];

    console.log(` Running: pandoc ${pandocArgs.join(' ')}`);
    execFileSync('pandoc', pandocArgs, { stdio: 'pipe' });

    removePreprocessedFile();

    let markdownContent = readFileSync(outputFile, 'utf8');
    markdownContent = postProcessMarkdown(markdownContent, inputDir);
    writeFileSync(outputFile, markdownContent);

    console.log(`✅ Conversion completed: ${outputFile}`);

    // Newline count, i.e. what `wc -l` reports, without spawning a process.
    const lines = markdownContent.split('\n').length - 1;
    console.log(`π Result: ${lines} lines written`);
  } catch (error) {
    console.error('β Pandoc conversion failed:');
    console.error(error.message);

    // Best-effort cleanup: must happen before process.exit, and a cleanup
    // failure must not hide the conversion error reported above.
    try {
      removePreprocessedFile();
    } catch (cleanupError) {
      console.error(cleanupError.message);
    }
    process.exit(1);
  }
}
|
|
/**
 * CLI entry point: parse the arguments, optionally empty the output
 * directory, then run the conversion.
 */
function main() {
  const { input, output, clean } = parseArgs();

  if (clean) {
    console.log('π§Ή Cleaning output directory...');
    cleanDirectory(output);
  }

  convertLatexToMarkdown(input, output);

  console.log('π Simple conversion completed!');
}
|
|
| |
// Print usage and exit before any conversion work when help is requested.
// The default paths are interpolated from DEFAULT_INPUT / DEFAULT_OUTPUT so
// the help text cannot drift from what parseArgs() actually uses.
if (process.argv.includes('--help') || process.argv.includes('-h')) {
  console.log(`
π Simple LaTeX to Markdown Converter

Usage:
  node scripts/simple-latex-to-markdown.mjs [options]

Options:
  --input=PATH    Input LaTeX file (default: ${DEFAULT_INPUT})
  --output=PATH   Output directory (default: ${DEFAULT_OUTPUT})
  --clean         Clean output directory before conversion
  --help, -h      Show this help

Examples:
  # Basic conversion
  node scripts/simple-latex-to-markdown.mjs

  # Custom paths
  node scripts/simple-latex-to-markdown.mjs --input=my-paper.tex --output=converted/

  # Clean output first
  node scripts/simple-latex-to-markdown.mjs --clean
`);
  process.exit(0);
}

// NOTE(review): main() runs on every load of this module, including when it
// is imported only for the exported convertLatexToMarkdown — confirm that
// importers expect the conversion to run as a side effect.
main();
|
|