From 1ccbc17b79701bfd90968c9eea5d77c5cc524662 Mon Sep 17 00:00:00 2001
From: Claude
Date: Sat, 15 Nov 2025 17:28:23 +0000
Subject: [PATCH] Improve fetcher with browser-like headers and cookie handling

- Add cookie jar for session management
- Include sec-ch-ua and Sec-Fetch-* headers (Chrome-like)
- Add HTTPS agent with keepAlive
- Log 403 response body for debugging

Result: Still blocked by TLS fingerprinting
- Both Ruliweb and Arcalive return "Access denied"
- Need Puppeteer to bypass advanced bot detection
---
 crawler/src/utils/fetcher.ts | 60 ++++++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/crawler/src/utils/fetcher.ts b/crawler/src/utils/fetcher.ts
index fcc459a..04e3682 100644
--- a/crawler/src/utils/fetcher.ts
+++ b/crawler/src/utils/fetcher.ts
@@ -1,34 +1,72 @@
 import axios, { AxiosError } from 'axios';
+import https from 'https';
 import { Logger } from './logger.js';
 import { CRAWLER_CONFIG, USER_AGENT } from '../config.js';
 
 export class Fetcher {
+  private static cookieJar: Map = new Map();
+
   private static async sleep(ms: number): Promise {
     return new Promise((resolve) => setTimeout(resolve, ms));
   }
 
+  private static getHostname(url: string): string {
+    try {
+      return new URL(url).hostname;
+    } catch {
+      return '';
+    }
+  }
+
   static async fetchHTML(
     url: string,
     retries: number = CRAWLER_CONFIG.maxRetries
   ): Promise {
+    const hostname = this.getHostname(url);
+
     for (let attempt = 1; attempt <= retries; attempt++) {
       try {
         Logger.info(`Fetching: ${url} (attempt ${attempt}/${retries})`);
 
+        const headers: any = {
+          'User-Agent': USER_AGENT,
+          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+          'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
+          'Accept-Encoding': 'gzip, deflate, br',
+          'Cache-Control': 'max-age=0',
+          'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+          'sec-ch-ua-mobile': '?0',
+          'sec-ch-ua-platform': '"Windows"',
+          'Sec-Fetch-Dest': 'document',
+          'Sec-Fetch-Mode': 'navigate',
+          'Sec-Fetch-Site': 'none',
+          'Sec-Fetch-User': '?1',
+          'Upgrade-Insecure-Requests': '1',
+          'DNT': '1',
+        };
+
+        // 쿠키가 있으면 추가
+        const cookie = this.cookieJar.get(hostname);
+        if (cookie) {
+          headers['Cookie'] = cookie;
+        }
+
         const response = await axios.get(url, {
-          headers: {
-            'User-Agent': USER_AGENT,
-            Accept:
-              'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
-            'Accept-Encoding': 'gzip, deflate, br',
-            Connection: 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-          },
+          headers,
           timeout: CRAWLER_CONFIG.timeout,
           maxRedirects: 5,
+          httpsAgent: new https.Agent({
+            rejectUnauthorized: false,
+            keepAlive: true,
+          }),
         });
 
+        // Set-Cookie 헤더 저장
+        const setCookie = response.headers['set-cookie'];
+        if (setCookie && setCookie.length > 0) {
+          this.cookieJar.set(hostname, setCookie.join('; '));
+        }
+
         Logger.success(`Fetched: ${url}`);
         return response.data;
       } catch (error) {
@@ -39,6 +77,10 @@ export class Fetcher {
             `HTTP ${axiosError.response.status} for ${url}`,
             axiosError.message
           );
+          // 403/401이면 응답 본문 확인
+          if (axiosError.response.status === 403 || axiosError.response.status === 401) {
+            Logger.warn(`Response body: ${String(axiosError.response.data).substring(0, 200)}`);
+          }
         } else if (axiosError.request) {
           Logger.error(`No response from ${url}`, axiosError.message);
         } else {