Add HTML structure debugging script

- Create debug-html.ts, which fetches each target page and saves its actual HTML to a file
- Add npm run debug command to package.json
- Helps identify correct CSS selectors for crawlers
- Saves the HTML to debug-ruliweb.html and debug-arcalive.html in the same directory as the script (crawler/src)

Usage: npm run debug
This commit is contained in:
Claude
2025-11-15 17:57:35 +00:00
parent ae85dcbd87
commit dd4345267a
2 changed files with 49 additions and 1 deletions

View File

@@ -8,7 +8,8 @@
"dev": "tsx watch src/index.ts",
"build": "tsc",
"start": "node dist/index.js",
"test": "tsx src/test.ts"
"test": "tsx src/test.ts",
"debug": "tsx src/debug-html.ts"
},
"keywords": [
"crawler",

47
crawler/src/debug-html.ts Normal file
View File

@@ -0,0 +1,47 @@
import { promises as fs } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { PuppeteerFetcher } from './utils/puppeteer-fetcher.js';
import { Logger } from './utils/logger.js';
// ES modules do not provide __filename/__dirname, so both are rebuilt here
// from this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const { dir: __dirname } = path.parse(__filename);
/**
 * Fetches each target board page and dumps its HTML next to this script so the
 * markup can be inspected when working out CSS selectors.
 *
 * For every entry in the hard-coded target list the page is fetched with
 * PuppeteerFetcher.fetchHTML. A truthy result is written to
 * `debug-<lowercased name>.html` in this module's directory and its first
 * 500 characters are logged; a falsy result is logged as an error and the loop
 * moves on. PuppeteerFetcher.delay() is awaited after every target, and the
 * browser is closed once the loop has finished.
 *
 * @returns {Promise<void>} Resolves after the browser has been closed. Rejects
 *   if a fetch, a file write, the delay, or the close rejects; in that case the
 *   browser is NOT closed here and cleanup is left to the caller.
 */
async function debugHTML() {
  Logger.info('========== HTML Structure Debug ==========');

  const urls = [
    { name: 'Ruliweb', url: 'https://bbs.ruliweb.com/community/board/300143' },
    { name: 'Arcalive', url: 'https://arca.live/b/breaking' },
  ];

  for (const { name, url } of urls) {
    Logger.info(`Fetching ${name}: ${url}`);

    // NOTE(review): assumes fetchHTML resolves to a string on success and a
    // falsy value on failure — confirm against PuppeteerFetcher.
    const html = await PuppeteerFetcher.fetchHTML(url);

    if (html) {
      // Save the HTML to a file.
      const filename = `debug-${name.toLowerCase()}.html`;
      const filepath = path.join(__dirname, filename);
      await fs.writeFile(filepath, html, 'utf-8');

      // html.length counts UTF-16 code units, not bytes; report the real
      // UTF-8 size so the logged number matches the file that was written.
      const byteCount = Buffer.byteLength(html, 'utf-8');
      Logger.success(`Saved HTML to ${filename} (${byteCount} bytes)`);

      // HTML preview (first 500 characters).
      Logger.info(`HTML Preview:\n${html.substring(0, 500)}...`);
    } else {
      Logger.error(`Failed to fetch ${name}`);
    }

    await PuppeteerFetcher.delay();
  }

  await PuppeteerFetcher.closeBrowser();

  Logger.info('========== Debug completed ==========');
  Logger.info('Check debug-ruliweb.html and debug-arcalive.html files');
}
// Script entry point: run the debug pass. On failure, log the error, attempt
// to close the browser, and exit with status 1 whether or not the close succeeds.
debugHTML().catch(async (failure) => {
  Logger.error('Debug failed', failure);
  try {
    await PuppeteerFetcher.closeBrowser();
  } finally {
    process.exit(1);
  }
});