Initial commit
This commit is contained in:
348
lib/scraping/service.ts
Normal file
348
lib/scraping/service.ts
Normal file
@@ -0,0 +1,348 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
/**
 * Product details returned by the scrapers in this file.
 *
 * Every field except `url` is nullable: a scraper reports `null` for any
 * piece of information it could not find in the page.
 */
export interface ScrapedData {
  /** Product or page title. */
  title: string | null;
  /** Short description of the product or page. */
  description: string | null;
  /** Numeric price, without any currency symbol. */
  price: number | null;
  /** Currency of `price` (e.g. "USD", "EUR", "GBP"). */
  currency: string | null;
  /** Image address, taken verbatim from the page markup. */
  imageUrl: string | null;
  /** The URL that was passed to the scraper that produced this record. */
  url: string;
}
|
||||
|
||||
/**
 * Download the document at `url` and return its body as text.
 *
 * The request is sent with a desktop-browser User-Agent and is abandoned
 * after 10 seconds.
 *
 * @param url - Absolute URL to request.
 * @returns The response body as a string.
 * @throws Error with a "Failed to fetch URL: ..." message when the request
 *   fails or the body is not text.
 */
async function fetchHtml(url: string): Promise<string> {
  try {
    const response = await axios.get<unknown>(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
      },
      timeout: 10000, // 10 second timeout
      // Ask for the raw text so a JSON-looking body is not auto-parsed into
      // an object, which would break the Promise<string> contract.
      responseType: 'text',
    });

    const body = response.data;
    if (typeof body !== 'string') {
      throw new Error('Response body is not text');
    }
    return body;
  } catch (error) {
    throw new Error(`Failed to fetch URL: ${error instanceof Error ? error.message : 'Unknown error'}`);
  }
}
|
||||
|
||||
/**
 * Parse a monetary amount and its currency out of free-form price text.
 *
 * Patterns are tried in this order and the first one that yields a number wins:
 *   1. `$` prefix          - "$99.99", "$1,299.99"
 *   2. `USD` suffix        - "99.99 USD"
 *   3. `€` prefix          - "€99,99", "€1.299,99", "€99.99"
 *   4. `€` / `EUR` suffix  - "99,99 €", "1.299,99 EUR"
 *   5. `£` prefix          - "£50", "£1,050.00"
 *   6. bare number         - "42", "1,299.99"
 *
 * Dollar/pound amounts use "," for thousands and "." for cents. Euro amounts
 * accept either separator: groups of exactly three digits are treated as
 * thousands and a trailing group of exactly two digits as cents.
 *
 * For patterns 1-5 the currency comes from the pattern that matched. For a
 * bare number the text is searched for a symbol or ISO code, and "USD" is
 * used when none is present.
 *
 * @param text - Raw text that may contain a price.
 * @returns The parsed price and currency, or both `null` when no number is found.
 */
function extractPrice(text: string): { price: number | null; currency: string | null } {
  // Drop all whitespace so "99,99 €" and "99,99€" look the same to the patterns.
  const normalized = text.trim().replace(/\s+/g, '');

  // "1,299.99" -> 1299.99 (comma = thousands separator).
  const fromDollarStyle = (m: RegExpMatchArray): number => {
    const digits = m[1];
    return digits ? parseFloat(digits.replace(/,/g, '')) : NaN;
  };

  // Group 1 is the integer part (may contain thousands separators of either
  // kind); group 2, when present, is the two-digit fractional part.
  const fromEuroStyle = (m: RegExpMatchArray): number => {
    const whole = m[1];
    if (!whole) {
      return NaN;
    }
    return parseFloat(`${whole.replace(/[.,]/g, '')}.${m[2] || '0'}`);
  };

  // Used only for the bare-number fallback, where no pattern fixes the currency.
  const sniffCurrency = (): string => {
    if (normalized.includes('$')) return 'USD';
    if (normalized.includes('€')) return 'EUR';
    if (normalized.includes('£')) return 'GBP';
    if (/USD/i.test(normalized)) return 'USD';
    if (/EUR/i.test(normalized)) return 'EUR';
    if (/GBP/i.test(normalized)) return 'GBP';
    return 'USD';
  };

  const rules: ReadonlyArray<{
    pattern: RegExp;
    currency: string | null; // null => sniff from the text
    toNumber: (m: RegExpMatchArray) => number;
  }> = [
    { pattern: /\$(\d+(?:,\d{3})*(?:\.\d{2})?)/, currency: 'USD', toNumber: fromDollarStyle }, // $99.99, $1,299.99
    { pattern: /(\d+(?:,\d{3})*(?:\.\d{2})?)\s*USD/i, currency: 'USD', toNumber: fromDollarStyle }, // 99.99 USD
    { pattern: /€(\d+(?:[.,]\d{3})*)(?:[.,](\d{2}))?/, currency: 'EUR', toNumber: fromEuroStyle }, // €99,99, €1.299,99
    { pattern: /(\d+(?:[.,]\d{3})*)(?:[.,](\d{2}))?(?:€|EUR)/i, currency: 'EUR', toNumber: fromEuroStyle }, // 99,99 €
    { pattern: /£(\d+(?:,\d{3})*(?:\.\d{2})?)/, currency: 'GBP', toNumber: fromDollarStyle }, // £99.99
    { pattern: /(\d+(?:,\d{3})*(?:\.\d{2})?)/, currency: null, toNumber: fromDollarStyle }, // Fallback: just numbers
  ];

  for (const rule of rules) {
    const match = normalized.match(rule.pattern);
    if (!match) {
      continue;
    }

    const price = rule.toNumber(match);
    if (!isNaN(price)) {
      return { price, currency: rule.currency || sniffCurrency() };
    }
  }

  return { price: null, currency: null };
}
|
||||
|
||||
/**
 * Generic scraper using Open Graph and meta tags.
 *
 * Title, description and image come from `og:*` meta tags first, then from
 * `<title>`, `meta[name="description"]` and `link[rel="image_src"]`.
 *
 * Price is taken from an `og:price:*` or `product:price:*` meta pair when both
 * amount and currency are present and the amount parses as a number. Otherwise
 * a list of common price selectors is tried and the first element whose text
 * contains a recognisable price is used.
 *
 * @param html - Page markup to parse.
 * @param url - URL to echo back in the result.
 * @returns The scraped fields; anything not found is `null`.
 */
function scrapeGeneric(html: string, url: string): ScrapedData {
  const $ = cheerio.load(html);

  // Try Open Graph tags first
  const ogTitle = $('meta[property="og:title"]').attr('content');
  const ogDescription = $('meta[property="og:description"]').attr('content');
  const ogImage = $('meta[property="og:image"]').attr('content');

  // Fallback to other meta tags
  const metaDescription = $('meta[name="description"]').attr('content');
  const title = ogTitle || $('title').text().trim() || null;
  const description = ogDescription || metaDescription || null;
  const imageUrl = ogImage || $('link[rel="image_src"]').attr('href') || null;

  let price: number | null = null;
  let currency: string | null = null;

  // Structured price metadata: "og:price:*" first, then "product:price:*".
  for (const prefix of ['og', 'product']) {
    const amount = $(`meta[property="${prefix}:price:amount"]`).attr('content');
    const code = $(`meta[property="${prefix}:price:currency"]`).attr('content');
    if (!amount || !code) {
      continue;
    }

    const parsed = parseFloat(amount);
    // An unparsable amount must not leak out as NaN; keep looking instead.
    if (!isNaN(parsed)) {
      price = parsed;
      currency = code;
      break;
    }
  }

  if (price === null) {
    // Try to find price in common selectors
    const priceSelectors = [
      '.price',
      '[data-price]',
      '.product-price',
      '[itemprop="price"]',
      '.a-price .a-offscreen', // Amazon
    ];

    for (const selector of priceSelectors) {
      const priceText = $(selector).first().text();
      if (!priceText) {
        continue;
      }

      const extracted = extractPrice(priceText);
      if (extracted.price !== null) {
        price = extracted.price;
        currency = extracted.currency;
        break;
      }
    }
  }

  return {
    title,
    description,
    price,
    currency,
    imageUrl,
    url,
  };
}
|
||||
|
||||
/**
 * Amazon-specific scraper.
 *
 * Price is read from the split `.a-price-whole` / `.a-price-fraction` elements
 * first. Only the digits of each part are kept and they are joined with an
 * explicit decimal point, so the result does not depend on whether the
 * "whole" text carries its own separator or grouping characters. If that
 * yields no number, a list of other price selectors is run through
 * `extractPrice`.
 *
 * @param html - Page markup to parse.
 * @param url - URL to echo back in the result.
 * @returns The scraped fields; anything not found is `null`.
 */
function scrapeAmazon(html: string, url: string): ScrapedData {
  const $ = cheerio.load(html);

  // Amazon-specific selectors
  const title = $('#productTitle').text().trim() ||
    $('span[id="productTitle"]').text().trim() ||
    null;

  const description = $('#feature-bullets ul li').first().text().trim() ||
    $('meta[name="description"]').attr('content') ||
    null;

  // Amazon price selectors (they change frequently)
  let price: number | null = null;
  let currency: string | null = null;

  const wholeDigits = $('.a-price-whole').first().text().replace(/\D/g, '');
  const fractionDigits = $('.a-price-fraction').first().text().replace(/\D/g, '');

  if (wholeDigits) {
    const parsed = parseFloat(`${wholeDigits}.${fractionDigits || '00'}`);
    if (!isNaN(parsed)) {
      price = parsed;

      // NOTE(review): assumes the currency symbol is rendered in
      // `.a-price-symbol` next to the amount - confirm against live markup.
      // Falls back to USD when the symbol is missing or unrecognised.
      const symbol = $('.a-price-symbol').first().text();
      if (symbol.includes('€')) {
        currency = 'EUR';
      } else if (symbol.includes('£')) {
        currency = 'GBP';
      } else {
        currency = 'USD';
      }
    }
  }

  // Fallback to other price selectors
  if (price === null) {
    const priceSelectors = [
      '.a-price .a-offscreen',
      '#priceblock_ourprice',
      '#priceblock_dealprice',
      '.a-price-whole',
    ];

    for (const selector of priceSelectors) {
      const priceText = $(selector).first().text();
      if (!priceText) {
        continue;
      }

      const extracted = extractPrice(priceText);
      if (extracted.price !== null) {
        price = extracted.price;
        currency = extracted.currency;
        break;
      }
    }
  }

  // Amazon images
  const imageUrl = $('#landingImage').attr('src') ||
    $('#imgBlkFront').attr('src') ||
    $('img[data-old-hires]').attr('data-old-hires') ||
    $('meta[property="og:image"]').attr('content') ||
    null;

  return {
    title,
    description,
    price,
    currency,
    imageUrl,
    url,
  };
}
|
||||
|
||||
/**
 * Scraper for Target product pages.
 *
 * Each field is read from Target's `data-test` hooks and falls back to the
 * matching Open Graph meta tag. Currency is "USD" unless the price text
 * indicates otherwise.
 */
function scrapeTarget(html: string, url: string): ScrapedData {
  const page = cheerio.load(html);

  const heading = page('h1[data-test="product-title"]').text().trim();
  const details = page('div[data-test="item-details-description"]').text().trim();
  const photo = page('img[data-test="product-image"]').attr('src');

  // The price may live in either a <span> or a <div> with the same hook.
  const rawPrice =
    page('span[data-test="product-price"]').first().text() ||
    page('div[data-test="product-price"]').first().text();

  let amount: number | null = null;
  let currencyCode = 'USD';
  if (rawPrice) {
    const parsed = extractPrice(rawPrice);
    amount = parsed.price;
    currencyCode = parsed.currency || 'USD';
  }

  return {
    title: heading || page('meta[property="og:title"]').attr('content') || null,
    description: details || page('meta[property="og:description"]').attr('content') || null,
    price: amount,
    currency: currencyCode,
    imageUrl: photo || page('meta[property="og:image"]').attr('content') || null,
    url,
  };
}
|
||||
|
||||
/**
 * Scraper for Walmart product pages.
 *
 * Fields are read from `itemprop` microdata and fall back to Open Graph meta
 * tags. Currency is "USD" unless the price text indicates otherwise.
 */
function scrapeWalmart(html: string, url: string): ScrapedData {
  const doc = cheerio.load(html);

  // Price: microdata `content` attribute, then the element text, then the
  // legacy `.price-characteristic` element.
  const priceNode = doc('span[itemprop="price"]').first();
  const rawPrice =
    priceNode.attr('content') ||
    priceNode.text() ||
    doc('.price-characteristic').first().text();

  const parsed = rawPrice ? extractPrice(rawPrice) : null;
  const price: number | null = parsed ? parsed.price : null;
  const currency: string = parsed ? parsed.currency || 'USD' : 'USD';

  const nameText = doc('h1[itemprop="name"]').text().trim();
  const title = nameText ? nameText : doc('meta[property="og:title"]').attr('content') || null;

  const bodyText = doc('div[itemprop="description"]').text().trim();
  const description = bodyText
    ? bodyText
    : doc('meta[property="og:description"]').attr('content') || null;

  const imageSrc = doc('img[itemprop="image"]').attr('src');
  const imageUrl = imageSrc ? imageSrc : doc('meta[property="og:image"]').attr('content') || null;

  return { title, description, price, currency, imageUrl, url };
}
|
||||
|
||||
/**
 * Scraper for Best Buy product pages.
 *
 * Fields are read from Best Buy's page-specific selectors and fall back to
 * Open Graph meta tags. Currency is "USD" unless the price text indicates
 * otherwise.
 */
function scrapeBestBuy(html: string, url: string): ScrapedData {
  const $page = cheerio.load(html);

  const result: ScrapedData = {
    title: null,
    description: null,
    price: null,
    currency: 'USD',
    imageUrl: null,
    url,
  };

  result.title =
    $page('h1.heading-5').first().text().trim() ||
    $page('meta[property="og:title"]').attr('content') ||
    null;

  result.description =
    $page('div.shop-product-description').first().text().trim() ||
    $page('meta[property="og:description"]').attr('content') ||
    null;

  // Current price widget first, then the older hero-price markup.
  const priceLabel =
    $page('div[data-testid="customer-price"] span').first().text() ||
    $page('.priceView-hero-price span').first().text();

  if (priceLabel) {
    const { price: amount, currency: code } = extractPrice(priceLabel);
    result.price = amount;
    result.currency = code || 'USD';
  }

  result.imageUrl =
    $page('img.primary-image').first().attr('src') ||
    $page('meta[property="og:image"]').attr('content') ||
    null;

  return result;
}
|
||||
|
||||
/**
 * Main scrape function - detects the site and uses the appropriate scraper.
 *
 * The input is trimmed and, when it carries no `scheme://` prefix, is assumed
 * to be HTTPS. Only `http:` and `https:` URLs are fetched. The hostname then
 * selects a site-specific scraper (Amazon, Target, Walmart, Best Buy); any
 * other host goes through the generic Open Graph scraper.
 *
 * NOTE(review): the URL is fetched as given. If callers pass user-supplied
 * URLs, consider restricting internal/private hosts before calling this.
 *
 * @param url - Page address, with or without a scheme.
 * @returns The scraped product data.
 * @throws Error with a "Scraping failed: ..." message when the URL is invalid,
 *   uses an unsupported protocol, or cannot be fetched.
 */
export async function scrapeUrl(url: string): Promise<ScrapedData> {
  try {
    // Normalize URL. Test for a real "scheme://" prefix, not just the letters
    // "http", so hosts like "httpbin.org" still get a scheme added.
    const trimmed = url.trim();
    const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
    const normalizedUrl = hasScheme ? trimmed : `https://${trimmed}`;
    const urlObj = new URL(normalizedUrl);

    if (urlObj.protocol !== 'http:' && urlObj.protocol !== 'https:') {
      throw new Error(`Unsupported protocol: ${urlObj.protocol}`);
    }

    const hostname = urlObj.hostname.toLowerCase();

    // True for the domain itself and its subdomains, but not for hosts that
    // merely contain the text (e.g. "nottarget.com", "target.com.evil.net").
    const isHost = (domain: string): boolean =>
      hostname === domain || hostname.endsWith(`.${domain}`);

    // Amazon uses many country domains: amazon.com, amazon.de, amazon.co.uk,
    // amazon.com.au, ... Hosts that do not fit fall back to the generic scraper.
    const isAmazon = /(^|\.)amazon\.(?:com|[a-z]{2,3})(?:\.[a-z]{2})?$/.test(hostname);

    // Fetch HTML
    const html = await fetchHtml(normalizedUrl);

    // Use site-specific scraper if available
    if (isAmazon) {
      return scrapeAmazon(html, normalizedUrl);
    }
    if (isHost('target.com')) {
      return scrapeTarget(html, normalizedUrl);
    }
    if (isHost('walmart.com')) {
      return scrapeWalmart(html, normalizedUrl);
    }
    if (isHost('bestbuy.com')) {
      return scrapeBestBuy(html, normalizedUrl);
    }

    // Fallback to generic scraper
    return scrapeGeneric(html, normalizedUrl);
  } catch (error) {
    throw new Error(`Scraping failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
  }
}
|
||||
Reference in New Issue
Block a user