Fix HTML selectors based on actual page structure

Ruliweb changes:
- Use tr.table_body instead of .table_body tr
- Correct selectors: .subject .subject_link, .writer a, .hit, .recomd, .num_reply, .time
- Remove icons/reply count from title text (strip child elements of the title link)
- Remove the notice-row exclusion check (.notice)
- Log each parsed post at info level (title, views, comments, likes) to aid debugging

Arcalive changes:
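- Use a.vrow instead of .vrow, and read the post URL from the row's own href
- Remove the notice-row exclusion check (.notice class)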
- Correct selectors: .col-title .title, .col-author .user-info span[data-filter]
- Correct selectors (continued): .col-view (views), .col-rate (likes), .col-time time (timestamp)
- Log each parsed post at info level (title, views, comments, likes) to aid debugging

Both crawlers now match the actual HTML structure of their target pages.
This commit is contained in:
Claude
2025-11-15 18:23:12 +00:00
parent dd4345267a
commit bae43e4679
2 changed files with 40 additions and 22 deletions

View File

@@ -15,24 +15,34 @@ export class ArcaliveCrawler extends BaseCrawler {
const $ = cheerio.load(html); const $ = cheerio.load(html);
const posts: Post[] = []; const posts: Post[] = [];
// 아카라이브 구조에 맞게 선택자 수정 // 실제 아카라이브 HTML 구조에 맞게 수정
$('.vrow').each((index, element) => { $('a.vrow').each((index, element) => {
if (index >= CRAWLER_CONFIG.maxPostsPerBoard) return false; if (index >= CRAWLER_CONFIG.maxPostsPerBoard) return false;
try { try {
const $el = $(element); const $el = $(element);
// 공지사항 제외 // URL (a 태그의 href)
if ($el.hasClass('notice')) return; const url = $el.attr('href') || '';
const titleEl = $el.find('.title a'); // 제목
const title = this.cleanText(titleEl.text()); const title = this.cleanText($el.find('.col-title .title').text());
const url = titleEl.attr('href') || '';
const author = this.cleanText($el.find('.user-info').text()); // 작성자 (data-filter 속성이 있는 span 또는 첫 번째 span)
const views = this.parseNumber($el.find('.view-count').text()); const authorEl = $el.find('.col-author .user-info span[data-filter]');
const comments = this.parseNumber($el.find('.comment-count').text()); const author = this.cleanText(authorEl.length > 0 ? authorEl.text() : $el.find('.col-author .user-info span').first().text());
const likes = this.parseNumber($el.find('.vote-count').text());
const timeStr = this.cleanText($el.find('.time').text()); // 조회수, 추천수
const views = this.parseNumber($el.find('.col-view').text());
const likes = this.parseNumber($el.find('.col-rate').text());
// 댓글 수 (아카라이브는 제목에 포함되어 있을 수 있음, 또는 별도 요소)
const commentEl = $el.find('.col-title .comment-count');
const comments = commentEl.length > 0 ? this.parseNumber(commentEl.text()) : 0;
// 시간
const timeEl = $el.find('.col-time time');
const timeStr = timeEl.length > 0 ? this.cleanText(timeEl.text()) : '';
if (!title) return; if (!title) return;
@@ -51,6 +61,7 @@ export class ArcaliveCrawler extends BaseCrawler {
}; };
posts.push(post); posts.push(post);
Logger.info(`Parsed Arcalive post: ${title} (views: ${views}, comments: ${comments}, likes: ${likes})`);
} catch (error) { } catch (error) {
Logger.error(`Error parsing Arcalive post at index ${index}`, error); Logger.error(`Error parsing Arcalive post at index ${index}`, error);
} }

View File

@@ -15,24 +15,30 @@ export class RuliwebCrawler extends BaseCrawler {
const $ = cheerio.load(html); const $ = cheerio.load(html);
const posts: Post[] = []; const posts: Post[] = [];
// 루리웹 게시판 구조에 맞게 선택자 수정 필요 // 실제 루리웹 HTML 구조에 맞게 수정
// 실제 페이지를 확인하여 정확한 선택자를 찾아야 함 $('tr.table_body').each((index, element) => {
$('.table_body tr').each((index, element) => {
if (index >= CRAWLER_CONFIG.maxPostsPerBoard) return false; if (index >= CRAWLER_CONFIG.maxPostsPerBoard) return false;
try { try {
const $el = $(element); const $el = $(element);
// 공지사항 제외 // 제목과 URL
if ($el.find('.notice').length > 0) return; const titleEl = $el.find('.subject .subject_link');
const title = this.cleanText(titleEl.clone().children().remove().end().text()); // 아이콘/댓글 수 제외
const titleEl = $el.find('.subject a');
const title = this.cleanText(titleEl.text());
const url = titleEl.attr('href') || ''; const url = titleEl.attr('href') || '';
const author = this.cleanText($el.find('.writer').text());
// 작성자
const author = this.cleanText($el.find('.writer a').text());
// 조회수, 추천수, 댓글수
const views = this.parseNumber($el.find('.hit').text()); const views = this.parseNumber($el.find('.hit').text());
const comments = this.parseNumber($el.find('.reply_num').text());
const likes = this.parseNumber($el.find('.recomd').text()); const likes = this.parseNumber($el.find('.recomd').text());
// 댓글 수 추출 (예: "(2)" -> 2)
const commentText = $el.find('.num_reply').text();
const comments = this.parseNumber(commentText);
// 시간
const timeStr = this.cleanText($el.find('.time').text()); const timeStr = this.cleanText($el.find('.time').text());
if (!title) return; if (!title) return;
@@ -52,6 +58,7 @@ export class RuliwebCrawler extends BaseCrawler {
}; };
posts.push(post); posts.push(post);
Logger.info(`Parsed Ruliweb post: ${title} (views: ${views}, comments: ${comments}, likes: ${likes})`);
} catch (error) { } catch (error) {
Logger.error(`Error parsing Ruliweb post at index ${index}`, error); Logger.error(`Error parsing Ruliweb post at index ${index}`, error);
} }