- Create crawler project structure
- Implement base crawler class with safety features
- Add crawlers for Ruliweb, Arcalive, and DCInside
- Implement utilities: fetcher (with retry logic) and logger
- Configure crawling settings (3-second delay, max 20 posts per board)
- Add test script and scheduler (30-minute intervals)

Safety measures:
- 3-second delay between requests
- Retry logic with exponential backoff
- Respect robots.txt (DCInside disabled)
- User-Agent and proper request headers

Current status:
- Structure complete
- Both Ruliweb and Arcalive return 403 (bot detection)
- Need to decide: use Puppeteer, switch targets, or use mock data
28 lines
617 B
JSON
28 lines
617 B
JSON
{
  "name": "community-crawler",
  "version": "1.0.0",
  "description": "Korean community crawler",
  "main": "dist/index.js",
  "type": "module",
  "scripts": {
    "dev": "tsx watch src/index.ts",
    "build": "tsc",
    "start": "node dist/index.js",
    "test": "tsx src/test.ts"
  },
  "keywords": ["crawler", "community", "korea"],
  "author": "",
  "license": "MIT",
  "dependencies": {
    "axios": "^1.7.9",
    "cheerio": "^1.0.0",
    "node-cron": "^3.0.3"
  },
  "devDependencies": {
    "@types/node": "^22.10.2",
    "@types/node-cron": "^3.0.11",
    "tsx": "^4.19.2",
    "typescript": "^5.7.2"
  }
}