diff --git a/crawler/src/crawlers/arcalive.ts b/crawler/src/crawlers/arcalive.ts index 8c88a8d..ec3a7f5 100644 --- a/crawler/src/crawlers/arcalive.ts +++ b/crawler/src/crawlers/arcalive.ts @@ -15,24 +15,34 @@ export class ArcaliveCrawler extends BaseCrawler { const $ = cheerio.load(html); const posts: Post[] = []; - // 아카라이브 구조에 맞게 선택자 수정 - $('.vrow').each((index, element) => { + // 실제 아카라이브 HTML 구조에 맞게 수정 + $('a.vrow').each((index, element) => { if (index >= CRAWLER_CONFIG.maxPostsPerBoard) return false; try { const $el = $(element); - // 공지사항 제외 - if ($el.hasClass('notice')) return; + // URL (a 태그의 href) + const url = $el.attr('href') || ''; - const titleEl = $el.find('.title a'); - const title = this.cleanText(titleEl.text()); - const url = titleEl.attr('href') || ''; - const author = this.cleanText($el.find('.user-info').text()); - const views = this.parseNumber($el.find('.view-count').text()); - const comments = this.parseNumber($el.find('.comment-count').text()); - const likes = this.parseNumber($el.find('.vote-count').text()); - const timeStr = this.cleanText($el.find('.time').text()); + // 제목 + const title = this.cleanText($el.find('.col-title .title').text()); + + // 작성자 (data-filter 속성이 있는 span 또는 첫 번째 span) + const authorEl = $el.find('.col-author .user-info span[data-filter]'); + const author = this.cleanText(authorEl.length > 0 ? authorEl.text() : $el.find('.col-author .user-info span').first().text()); + + // 조회수, 추천수 + const views = this.parseNumber($el.find('.col-view').text()); + const likes = this.parseNumber($el.find('.col-rate').text()); + + // 댓글 수 (아카라이브는 제목에 포함되어 있을 수 있음, 또는 별도 요소) + const commentEl = $el.find('.col-title .comment-count'); + const comments = commentEl.length > 0 ? this.parseNumber(commentEl.text()) : 0; + + // 시간 + const timeEl = $el.find('.col-time time'); + const timeStr = timeEl.length > 0 ? this.cleanText(timeEl.text()) : ''; if (!title) return; @@ -51,6 +61,7 @@ export class ArcaliveCrawler extends BaseCrawler { }; posts.push(post); + Logger.info(`Parsed Arcalive post: ${title} (views: ${views}, comments: ${comments}, likes: ${likes})`); } catch (error) { Logger.error(`Error parsing Arcalive post at index ${index}`, error); } diff --git a/crawler/src/crawlers/ruliweb.ts b/crawler/src/crawlers/ruliweb.ts index aceab60..a2c3913 100644 --- a/crawler/src/crawlers/ruliweb.ts +++ b/crawler/src/crawlers/ruliweb.ts @@ -15,24 +15,30 @@ export class RuliwebCrawler extends BaseCrawler { const $ = cheerio.load(html); const posts: Post[] = []; - // 루리웹 게시판 구조에 맞게 선택자 수정 필요 - // 실제 페이지를 확인하여 정확한 선택자를 찾아야 함 - $('.table_body tr').each((index, element) => { + // 실제 루리웹 HTML 구조에 맞게 수정 + $('tr.table_body').each((index, element) => { if (index >= CRAWLER_CONFIG.maxPostsPerBoard) return false; try { const $el = $(element); - // 공지사항 제외 - if ($el.find('.notice').length > 0) return; - - const titleEl = $el.find('.subject a'); - const title = this.cleanText(titleEl.text()); + // 제목과 URL + const titleEl = $el.find('.subject .subject_link'); + const title = this.cleanText(titleEl.clone().children().remove().end().text()); // 아이콘/댓글 수 제외 const url = titleEl.attr('href') || ''; - const author = this.cleanText($el.find('.writer').text()); + + // 작성자 + const author = this.cleanText($el.find('.writer a').text()); + + // 조회수, 추천수, 댓글수 const views = this.parseNumber($el.find('.hit').text()); - const comments = this.parseNumber($el.find('.reply_num').text()); const likes = this.parseNumber($el.find('.recomd').text()); + + // 댓글 수 추출 (예: "(2)" -> 2) + const commentText = $el.find('.num_reply').text(); + const comments = this.parseNumber(commentText); + + // 시간 const timeStr = this.cleanText($el.find('.time').text()); if (!title) return; @@ -52,6 +58,7 @@ export class RuliwebCrawler extends BaseCrawler { }; posts.push(post); + Logger.info(`Parsed Ruliweb post: ${title} (views: ${views}, comments: ${comments}, likes: ${likes})`); } catch (error) { Logger.error(`Error parsing Ruliweb post at index ${index}`, error); }