Add crawler implementation (Node.js + TypeScript)

- Create crawler project structure
- Implement base crawler class with safety features
- Add crawlers for Ruliweb, Arcalive, DCInside
- Implement utilities: fetcher (with retry logic), logger
- Configure crawling settings (3s delay, max 20 posts/board)
- Add test script and scheduler (30min intervals)

Safety measures:
- 3 second delay between requests
- Exponential backoff retry logic
- Respect robots.txt (DCInside disabled)
- User-Agent and proper headers

Current status:
- Structure complete
- Both Ruliweb and Arcalive return 403 (bot detection)
- Next step to decide: adopt Puppeteer, switch to different target sites, or fall back to mock data
This commit is contained in:
Claude
2025-11-15 17:18:09 +00:00
parent e8ca418817
commit c5ef580534
14 changed files with 1724 additions and 0 deletions

1200
crawler/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

27
crawler/package.json Normal file
View File

@@ -0,0 +1,27 @@
{
"name": "community-crawler",
"version": "1.0.0",
"description": "Korean community crawler",
"main": "dist/index.js",
"type": "module",
"scripts": {
"dev": "tsx watch src/index.ts",
"build": "tsc",
"start": "node dist/index.js",
"test": "tsx src/test.ts"
},
"keywords": ["crawler", "community", "korea"],
"author": "",
"license": "MIT",
"dependencies": {
"axios": "^1.7.9",
"cheerio": "^1.0.0",
"node-cron": "^3.0.3"
},
"devDependencies": {
"@types/node": "^22.10.2",
"@types/node-cron": "^3.0.11",
"tsx": "^4.19.2",
"typescript": "^5.7.2"
}
}

29
crawler/src/config.ts Normal file
View File

@@ -0,0 +1,29 @@
import type { CrawlerConfig, BoardConfig } from './types.js';
// Global crawler tuning knobs shared by every crawler instance.
export const CRAWLER_CONFIG: CrawlerConfig = {
  delay: 3000, // 3-second delay between requests (minimizes load on target servers)
  maxRetries: 3,
  timeout: 10000,
  maxPostsPerBoard: 20, // collect at most 20 posts per board
};

// Ruliweb board list
export const RULIWEB_BOARDS: BoardConfig[] = [
  { name: '유머 게시판', url: 'https://bbs.ruliweb.com/community/board/300143' },
  { name: '정치 게시판', url: 'https://bbs.ruliweb.com/community/board/300148' },
];

// Arcalive channel list
export const ARCALIVE_CHANNELS: BoardConfig[] = [
  { name: '일반', url: 'https://arca.live/b/breaking' },
  { name: '유머', url: 'https://arca.live/b/humor' },
];

// DCInside galleries (intentionally empty — crawling is disabled)
export const DCINSIDE_GALLERIES: BoardConfig[] = [
  // Disabled due to robots.txt restrictions
  // { name: '야구갤러리', url: 'https://gall.dcinside.com/board/lists/?id=baseball_new' },
];

// Desktop Chrome user agent sent with every request (see Fetcher headers).
export const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

View File

@@ -0,0 +1,61 @@
import * as cheerio from 'cheerio';
import { BaseCrawler } from './base.js';
import type { Post, BoardConfig } from '../types.js';
import { Fetcher } from '../utils/fetcher.js';
import { Logger } from '../utils/logger.js';
import { CRAWLER_CONFIG } from '../config.js';
/**
 * Crawler for Arcalive channel listing pages.
 * Parses the `.vrow` rows of a channel index into normalized Post records.
 */
export class ArcaliveCrawler extends BaseCrawler {
  protected communityName = 'Arcalive';

  /**
   * Crawl a single Arcalive channel and return its parsed posts.
   * Returns an empty array when the page could not be fetched.
   */
  async crawlBoard(board: BoardConfig): Promise<Post[]> {
    const html = await Fetcher.fetchHTML(board.url);
    if (!html) return [];

    const $ = cheerio.load(html);
    const collected: Post[] = [];

    // Selectors target Arcalive's listing markup (.vrow rows).
    $('.vrow').each((rowIndex, row) => {
      // Returning false stops cheerio's iteration once the cap is reached.
      if (rowIndex >= CRAWLER_CONFIG.maxPostsPerBoard) return false;
      try {
        const $row = $(row);

        // Skip pinned notice rows.
        if ($row.hasClass('notice')) return;

        const $title = $row.find('.title a');
        const title = this.cleanText($title.text());
        if (!title) return;

        const href = $title.attr('href') || '';
        const author = this.cleanText($row.find('.user-info').text());
        const postedAt = this.cleanText($row.find('.time').text());

        collected.push({
          id: this.generatePostId('arcalive', rowIndex),
          title,
          author: author || '익명',
          community: 'arcalive',
          board: board.name,
          content: title, // post bodies are not fetched
          views: this.parseNumber($row.find('.view-count').text()),
          comments: this.parseNumber($row.find('.comment-count').text()),
          likes: this.parseNumber($row.find('.vote-count').text()),
          timestamp: this.parseRelativeTime(postedAt),
          url: href.startsWith('http') ? href : `https://arca.live${href}`,
        });
      } catch (error) {
        Logger.error(`Error parsing Arcalive post at index ${rowIndex}`, error);
      }
    });

    return collected;
  }
}

View File

@@ -0,0 +1,89 @@
import type { Post, BoardConfig } from '../types.js';
import { Fetcher } from '../utils/fetcher.js';
import { Logger } from '../utils/logger.js';
/**
 * Shared base class for all community crawlers. Provides sequential
 * multi-board crawling with politeness delays, plus parsing helpers
 * (Korean relative-time labels, whitespace cleanup, lenient numbers).
 */
export abstract class BaseCrawler {
  /** Human-readable community name used in log messages. */
  protected abstract communityName: string;

  /** Crawl a single board/channel and return its parsed posts. */
  abstract crawlBoard(board: BoardConfig): Promise<Post[]>;

  /**
   * Crawl every board sequentially, pausing between boards so the target
   * server is not hammered. Per-board failures are logged and skipped;
   * the remaining boards are still crawled.
   */
  async crawlAll(boards: BoardConfig[]): Promise<Post[]> {
    Logger.info(`Starting ${this.communityName} crawl...`);
    const allPosts: Post[] = [];
    for (const board of boards) {
      try {
        const posts = await this.crawlBoard(board);
        allPosts.push(...posts);
        Logger.success(
          `Crawled ${posts.length} posts from ${this.communityName} - ${board.name}`
        );
        // Politeness delay before moving on to the next board.
        await Fetcher.delay();
      } catch (error) {
        Logger.error(
          `Failed to crawl ${this.communityName} - ${board.name}`,
          error
        );
      }
    }
    Logger.info(
      `Completed ${this.communityName} crawl: ${allPosts.length} total posts`
    );
    return allPosts;
  }

  /** Build a unique-enough post id from community name, wall clock, and row index. */
  protected generatePostId(community: string, index: number): string {
    return `${community}-${Date.now()}-${index}`;
  }

  /**
   * Convert a Korean relative-time label ("N분 전" = N minutes ago,
   * "N시간 전" = N hours ago, "N일 전" = N days ago, "어제" = yesterday)
   * or an absolute date ("YYYY-MM-DD", "MM-DD") into an ISO-8601 timestamp.
   * Falls back to the current time when the label cannot be parsed.
   */
  protected parseRelativeTime(timeStr: string): string {
    const now = new Date();
    // "N분 전" (N minutes ago)
    const minutesMatch = timeStr.match(/(\d+)분\s*전/);
    if (minutesMatch) {
      now.setMinutes(now.getMinutes() - parseInt(minutesMatch[1], 10));
      return now.toISOString();
    }
    // "N시간 전" (N hours ago)
    const hoursMatch = timeStr.match(/(\d+)시간\s*전/);
    if (hoursMatch) {
      now.setHours(now.getHours() - parseInt(hoursMatch[1], 10));
      return now.toISOString();
    }
    // "N일 전" (N days ago)
    const daysMatch = timeStr.match(/(\d+)일\s*전/);
    if (daysMatch) {
      now.setDate(now.getDate() - parseInt(daysMatch[1], 10));
      return now.toISOString();
    }
    // "어제" (yesterday)
    if (timeStr.includes('어제')) {
      now.setDate(now.getDate() - 1);
      return now.toISOString();
    }
    // Absolute date "YYYY-MM-DD". Previously the raw string was passed to
    // `new Date()`, which produced an Invalid Date (and a RangeError from
    // toISOString()) whenever the label carried extra text around the date.
    // Parse the whole string when possible, otherwise rebuild the date from
    // the captured groups (UTC midnight, matching date-only Date parsing).
    const dateMatch = timeStr.match(/(\d{4})-(\d{2})-(\d{2})/);
    if (dateMatch) {
      const parsed = new Date(timeStr);
      if (!Number.isNaN(parsed.getTime())) {
        return parsed.toISOString();
      }
      return new Date(
        Date.UTC(
          parseInt(dateMatch[1], 10),
          parseInt(dateMatch[2], 10) - 1,
          parseInt(dateMatch[3], 10)
        )
      ).toISOString();
    }
    // Short date "MM-DD": assumed to fall in the current year (this case
    // was documented but previously unhandled).
    const shortDateMatch = timeStr.match(/^(\d{1,2})-(\d{1,2})$/);
    if (shortDateMatch) {
      return new Date(
        Date.UTC(
          now.getFullYear(),
          parseInt(shortDateMatch[1], 10) - 1,
          parseInt(shortDateMatch[2], 10)
        )
      ).toISOString();
    }
    // Unrecognized format: fall back to the current time.
    return now.toISOString();
  }

  /** Collapse runs of whitespace to single spaces and trim surrounding space. */
  protected cleanText(text: string): string {
    return text.trim().replace(/\s+/g, ' ');
  }

  /** Extract the digits from a string like "1,234회" and parse them; 0 when none. */
  protected parseNumber(text: string): number {
    const cleaned = text.replace(/[^0-9]/g, '');
    return cleaned ? parseInt(cleaned, 10) : 0;
  }
}

View File

@@ -0,0 +1,15 @@
import { BaseCrawler } from './base.js';
import type { Post, BoardConfig } from '../types.js';
import { Logger } from '../utils/logger.js';
/**
 * Placeholder crawler for DCInside. Crawling is intentionally disabled
 * because the site's robots.txt disallows it, so every board yields no posts.
 */
export class DCInsideCrawler extends BaseCrawler {
  protected communityName = 'DCInside';

  async crawlBoard(board: BoardConfig): Promise<Post[]> {
    // Deliberate no-op: emit a warning and collect nothing.
    const reason = `DCInside crawling is disabled due to robots.txt restrictions: ${board.name}`;
    Logger.warn(reason);
    return [];
  }
}

View File

@@ -0,0 +1,62 @@
import * as cheerio from 'cheerio';
import { BaseCrawler } from './base.js';
import type { Post, BoardConfig } from '../types.js';
import { Fetcher } from '../utils/fetcher.js';
import { Logger } from '../utils/logger.js';
import { CRAWLER_CONFIG } from '../config.js';
/**
 * Crawler for Ruliweb board listing pages.
 * Parses `.table_body tr` rows of a board index into normalized Post records.
 */
export class RuliwebCrawler extends BaseCrawler {
  protected communityName = 'Ruliweb';

  /**
   * Crawl one Ruliweb board and return its parsed posts.
   * Yields an empty array when the page fetch fails.
   */
  async crawlBoard(board: BoardConfig): Promise<Post[]> {
    const html = await Fetcher.fetchHTML(board.url);
    if (!html) return [];

    const $ = cheerio.load(html);
    const results: Post[] = [];

    // NOTE(review): selectors follow Ruliweb's table layout and may need
    // adjustment against the live page markup — verify on a real response.
    $('.table_body tr').each((rowIndex, row) => {
      // Returning false stops cheerio's iteration once the cap is reached.
      if (rowIndex >= CRAWLER_CONFIG.maxPostsPerBoard) return false;
      try {
        const $row = $(row);

        // Skip notice rows.
        if ($row.find('.notice').length > 0) return;

        const $subject = $row.find('.subject a');
        const title = this.cleanText($subject.text());
        if (!title) return;

        const href = $subject.attr('href') || '';
        const author = this.cleanText($row.find('.writer').text());
        const postedAt = this.cleanText($row.find('.time').text());

        results.push({
          id: this.generatePostId('ruliweb', rowIndex),
          title,
          author: author || '익명',
          community: 'ruliweb',
          board: board.name,
          content: title, // detail pages are not fetched
          views: this.parseNumber($row.find('.hit').text()),
          comments: this.parseNumber($row.find('.reply_num').text()),
          likes: this.parseNumber($row.find('.recomd').text()),
          timestamp: this.parseRelativeTime(postedAt),
          url: href.startsWith('http') ? href : `https://bbs.ruliweb.com${href}`,
        });
      } catch (error) {
        Logger.error(`Error parsing Ruliweb post at index ${rowIndex}`, error);
      }
    });

    return results;
  }
}

51
crawler/src/index.ts Normal file
View File

@@ -0,0 +1,51 @@
import cron from 'node-cron';
import { promises as fs } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { RuliwebCrawler } from './crawlers/ruliweb.js';
import { ArcaliveCrawler } from './crawlers/arcalive.js';
import { Logger } from './utils/logger.js';
import { RULIWEB_BOARDS, ARCALIVE_CHANNELS } from './config.js';
import type { Post } from './types.js';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const OUTPUT_PATH = path.join(__dirname, '../../src/data/posts.json');
async function crawlAll(): Promise<void> {
Logger.info('========== Starting crawl job ==========');
const allPosts: Post[] = [];
// 루리웹 크롤링
const ruliwebCrawler = new RuliwebCrawler();
const ruliwebPosts = await ruliwebCrawler.crawlAll(RULIWEB_BOARDS);
allPosts.push(...ruliwebPosts);
// 아카라이브 크롤링
const arcaliveCrawler = new ArcaliveCrawler();
const arcalivePosts = await arcaliveCrawler.crawlAll(ARCALIVE_CHANNELS);
allPosts.push(...arcalivePosts);
// 결과 저장
try {
await fs.mkdir(path.dirname(OUTPUT_PATH), { recursive: true });
await fs.writeFile(OUTPUT_PATH, JSON.stringify(allPosts, null, 2), 'utf-8');
Logger.success(`Saved ${allPosts.length} posts to ${OUTPUT_PATH}`);
} catch (error) {
Logger.error('Failed to save posts', error);
}
Logger.info('========== Crawl job completed ==========');
}
// 즉시 한 번 실행
await crawlAll();
// 30분마다 실행
cron.schedule('*/30 * * * *', async () => {
await crawlAll();
});
Logger.info('Crawler scheduler started. Running every 30 minutes.');

62
crawler/src/test.ts Normal file
View File

@@ -0,0 +1,62 @@
import { promises as fs } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { RuliwebCrawler } from './crawlers/ruliweb.js';
import { ArcaliveCrawler } from './crawlers/arcalive.js';
import { Logger } from './utils/logger.js';
import { RULIWEB_BOARDS, ARCALIVE_CHANNELS } from './config.js';
import type { Post } from './types.js';
// Resolve the output file relative to this module (ESM has no __dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const OUTPUT_PATH = path.join(__dirname, '../../src/data/posts.json');

/**
 * One-shot test run: crawl every enabled community, print a small sample
 * of the collected posts, and persist the full result set.
 */
async function test(): Promise<void> {
  Logger.info('========== Starting test crawl ==========');
  const allPosts: Post[] = [];

  // Ruliweb test
  Logger.info('Testing Ruliweb crawler...');
  const ruliwebCrawler = new RuliwebCrawler();
  const ruliwebPosts = await ruliwebCrawler.crawlAll(RULIWEB_BOARDS);
  allPosts.push(...ruliwebPosts);
  Logger.info(`Ruliweb: ${ruliwebPosts.length} posts`);

  // Arcalive test
  Logger.info('Testing Arcalive crawler...');
  const arcaliveCrawler = new ArcaliveCrawler();
  const arcalivePosts = await arcaliveCrawler.crawlAll(ARCALIVE_CHANNELS);
  allPosts.push(...arcalivePosts);
  Logger.info(`Arcalive: ${arcalivePosts.length} posts`);

  // Print a small sample for manual inspection.
  Logger.info(`Total posts collected: ${allPosts.length}`);
  if (allPosts.length > 0) {
    Logger.info('Sample posts:');
    allPosts.slice(0, 3).forEach((post, index) => {
      console.log(`\n--- Post ${index + 1} ---`);
      console.log(`Title: ${post.title}`);
      console.log(`Author: ${post.author}`);
      console.log(`Community: ${post.community}`);
      console.log(`Board: ${post.board}`);
      console.log(`Views: ${post.views}, Comments: ${post.comments}, Likes: ${post.likes}`);
      console.log(`URL: ${post.url}`);
    });
  }

  // Persist results so the app can load them.
  try {
    await fs.mkdir(path.dirname(OUTPUT_PATH), { recursive: true });
    await fs.writeFile(OUTPUT_PATH, JSON.stringify(allPosts, null, 2), 'utf-8');
    Logger.success(`Saved ${allPosts.length} posts to ${OUTPUT_PATH}`);
  } catch (error) {
    Logger.error('Failed to save posts', error);
  }

  Logger.info('========== Test crawl completed ==========');
}

// Handle rejection explicitly: a bare `test();` leaves a floating promise,
// and an unhandled rejection terminates modern Node with an opaque trace.
test().catch((error) => {
  Logger.error('Test crawl failed', error);
  process.exitCode = 1;
});

27
crawler/src/types.ts Normal file
View File

@@ -0,0 +1,27 @@
/** Communities this project knows how to (or plans to) crawl. */
export type CommunityType = 'dcinside' | 'ruliweb' | 'arcalive';

/** A single crawled post, normalized across communities. */
export interface Post {
  id: string; // synthetic id: `${community}-${Date.now()}-${rowIndex}`
  title: string;
  author: string; // '익명' (anonymous) when the listing shows no author
  community: CommunityType;
  board: string; // display name of the source board/channel
  content: string; // currently mirrors the title; detail pages are not fetched
  views: number;
  comments: number;
  likes: number;
  timestamp: string; // ISO-8601, derived from the listing's time label
  url: string; // absolute URL of the post
}

/** Runtime tuning knobs for the crawlers (see config.ts for values). */
export interface CrawlerConfig {
  delay: number; // milliseconds to wait between requests
  maxRetries: number; // fetch attempts before giving up on a URL
  timeout: number; // per-request timeout in milliseconds
  maxPostsPerBoard: number; // cap on posts collected per board
}

/** A crawl target: one board/channel listing page. */
export interface BoardConfig {
  name: string;
  url: string;
}

View File

@@ -0,0 +1,63 @@
import axios, { AxiosError } from 'axios';
import { Logger } from './logger.js';
import { CRAWLER_CONFIG, USER_AGENT } from '../config.js';
export class Fetcher {
private static async sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}
static async fetchHTML(
url: string,
retries: number = CRAWLER_CONFIG.maxRetries
): Promise<string | null> {
for (let attempt = 1; attempt <= retries; attempt++) {
try {
Logger.info(`Fetching: ${url} (attempt ${attempt}/${retries})`);
const response = await axios.get(url, {
headers: {
'User-Agent': USER_AGENT,
Accept:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
Connection: 'keep-alive',
'Upgrade-Insecure-Requests': '1',
},
timeout: CRAWLER_CONFIG.timeout,
maxRedirects: 5,
});
Logger.success(`Fetched: ${url}`);
return response.data;
} catch (error) {
const axiosError = error as AxiosError;
if (axiosError.response) {
Logger.error(
`HTTP ${axiosError.response.status} for ${url}`,
axiosError.message
);
} else if (axiosError.request) {
Logger.error(`No response from ${url}`, axiosError.message);
} else {
Logger.error(`Error fetching ${url}`, axiosError.message);
}
if (attempt < retries) {
const backoffDelay = CRAWLER_CONFIG.delay * Math.pow(2, attempt - 1);
Logger.warn(`Retrying after ${backoffDelay}ms...`);
await this.sleep(backoffDelay);
}
}
}
Logger.error(`Failed to fetch ${url} after ${retries} attempts`);
return null;
}
static async delay(): Promise<void> {
await this.sleep(CRAWLER_CONFIG.delay);
}
}

View File

@@ -0,0 +1,21 @@
/** Minimal console logger: ISO-8601 timestamp + level tag in front of each line. */
export class Logger {
  /** Current time formatted for log prefixes. */
  private static getTimestamp(): string {
    return new Date().toISOString();
  }

  // Rest params are typed `unknown[]` rather than `any[]`: callers can still
  // pass anything (console.* accepts unknowns), but the untyped escape hatch
  // is removed, satisfying @typescript-eslint/no-explicit-any.
  static info(message: string, ...args: unknown[]): void {
    console.log(`[${this.getTimestamp()}] [INFO]`, message, ...args);
  }

  static error(message: string, ...args: unknown[]): void {
    console.error(`[${this.getTimestamp()}] [ERROR]`, message, ...args);
  }

  static warn(message: string, ...args: unknown[]): void {
    console.warn(`[${this.getTimestamp()}] [WARN]`, message, ...args);
  }

  static success(message: string, ...args: unknown[]): void {
    console.log(`[${this.getTimestamp()}] [SUCCESS]`, message, ...args);
  }
}

16
crawler/tsconfig.json Normal file
View File

@@ -0,0 +1,16 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"moduleResolution": "node",
"esModuleInterop": true,
"strict": true,
"skipLibCheck": true,
"outDir": "./dist",
"rootDir": "./src",
"resolveJsonModule": true,
"declaration": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}

1
src/data/posts.json Normal file
View File

@@ -0,0 +1 @@
[]