Improve fetcher with browser-like headers and cookie handling
- Add cookie jar for session management
- Include sec-ch-ua and Sec-Fetch-* headers (Chrome-like)
- Add HTTPS agent with keepAlive
- Log 403 response body for debugging

Result: Still blocked by TLS fingerprinting
- Both Ruliweb and Arcalive return "Access denied"
- Need Puppeteer to bypass advanced bot detection
This commit is contained in:
@@ -1,34 +1,72 @@
|
||||
import axios, { AxiosError } from 'axios';
|
||||
import https from 'https';
|
||||
import { Logger } from './logger.js';
|
||||
import { CRAWLER_CONFIG, USER_AGENT } from '../config.js';
|
||||
|
||||
export class Fetcher {
|
||||
private static cookieJar: Map<string, string> = new Map();
|
||||
|
||||
/**
 * Suspends the calling async flow for a fixed duration.
 *
 * @param ms - Delay in milliseconds, handed directly to `setTimeout`.
 * @returns A promise that settles with no value once the timer fires.
 */
private static async sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Extracts the host name from a URL string.
 *
 * @param url - URL text, parsed with the global `URL` constructor.
 * @returns The parsed `hostname`, or an empty string when `url` cannot be parsed.
 */
private static getHostname(url: string): string {
  let host = '';
  try {
    const parsed = new URL(url);
    host = parsed.hostname;
  } catch {
    // Unparseable input: keep the empty-string default instead of throwing.
  }
  return host;
}
|
||||
|
||||
static async fetchHTML(
|
||||
url: string,
|
||||
retries: number = CRAWLER_CONFIG.maxRetries
|
||||
): Promise<string | null> {
|
||||
const hostname = this.getHostname(url);
|
||||
|
||||
for (let attempt = 1; attempt <= retries; attempt++) {
|
||||
try {
|
||||
Logger.info(`Fetching: ${url} (attempt ${attempt}/${retries})`);
|
||||
|
||||
const headers: any = {
|
||||
'User-Agent': USER_AGENT,
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Cache-Control': 'max-age=0',
|
||||
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
|
||||
'sec-ch-ua-mobile': '?0',
|
||||
'sec-ch-ua-platform': '"Windows"',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'DNT': '1',
|
||||
};
|
||||
|
||||
// 쿠키가 있으면 추가
|
||||
const cookie = this.cookieJar.get(hostname);
|
||||
if (cookie) {
|
||||
headers['Cookie'] = cookie;
|
||||
}
|
||||
|
||||
const response = await axios.get(url, {
|
||||
headers: {
|
||||
'User-Agent': USER_AGENT,
|
||||
Accept:
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
Connection: 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
},
|
||||
headers,
|
||||
timeout: CRAWLER_CONFIG.timeout,
|
||||
maxRedirects: 5,
|
||||
httpsAgent: new https.Agent({
|
||||
rejectUnauthorized: false,
|
||||
keepAlive: true,
|
||||
}),
|
||||
});
|
||||
|
||||
// Set-Cookie 헤더 저장
|
||||
const setCookie = response.headers['set-cookie'];
|
||||
if (setCookie && setCookie.length > 0) {
|
||||
this.cookieJar.set(hostname, setCookie.join('; '));
|
||||
}
|
||||
|
||||
Logger.success(`Fetched: ${url}`);
|
||||
return response.data;
|
||||
} catch (error) {
|
||||
@@ -39,6 +77,10 @@ export class Fetcher {
|
||||
`HTTP ${axiosError.response.status} for ${url}`,
|
||||
axiosError.message
|
||||
);
|
||||
// 403/401이면 응답 본문 확인
|
||||
if (axiosError.response.status === 403 || axiosError.response.status === 401) {
|
||||
Logger.warn(`Response body: ${String(axiosError.response.data).substring(0, 200)}`);
|
||||
}
|
||||
} else if (axiosError.request) {
|
||||
Logger.error(`No response from ${url}`, axiosError.message);
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user