Initial commit

This commit is contained in:
michaeltieso
2025-12-01 14:49:17 +00:00
commit 3480888eaa
92 changed files with 16631 additions and 0 deletions

348
lib/scraping/service.ts Normal file
View File

@@ -0,0 +1,348 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
export interface ScrapedData {
title: string | null;
description: string | null;
price: number | null;
currency: string | null;
imageUrl: string | null;
url: string;
}
/**
* Fetch and parse HTML from a URL
*/
async function fetchHtml(url: string): Promise<string> {
try {
const response = await axios.get(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
},
timeout: 10000, // 10 second timeout
});
return response.data;
} catch (error) {
throw new Error(`Failed to fetch URL: ${error instanceof Error ? error.message : 'Unknown error'}`);
}
}
/**
* Extract price from text (e.g., "$99.99", "99,99 €", "£50")
*/
function extractPrice(text: string): { price: number | null; currency: string | null } {
// Remove whitespace and normalize
const normalized = text.trim().replace(/\s+/g, '');
// Match currency symbols and numbers
// Patterns: $99.99, 99.99 USD, €99,99, £50.00, etc.
const patterns = [
/\$(\d+(?:,\d{3})*(?:\.\d{2})?)/, // $99.99, $1,299.99
/(\d+(?:,\d{3})*(?:\.\d{2})?)\s*USD/i, // 99.99 USD
/€(\d+(?:\.\d{3})*(?:,\d{2})?)/, // €99,99
/£(\d+(?:,\d{3})*(?:\.\d{2})?)/, // £99.99
/(\d+(?:,\d{3})*(?:\.\d{2})?)/, // Fallback: just numbers
];
for (const pattern of patterns) {
const match = normalized.match(pattern);
if (match) {
// Extract number and remove commas, convert to float
const priceStr = match[1].replace(/,/g, '');
const price = parseFloat(priceStr);
if (!isNaN(price)) {
// Determine currency
let currency = 'USD';
if (normalized.includes('$')) currency = 'USD';
else if (normalized.includes('€')) currency = 'EUR';
else if (normalized.includes('£')) currency = 'GBP';
else if (normalized.match(/USD/i)) currency = 'USD';
else if (normalized.match(/EUR/i)) currency = 'EUR';
else if (normalized.match(/GBP/i)) currency = 'GBP';
return { price, currency };
}
}
}
return { price: null, currency: null };
}
/**
* Generic scraper using Open Graph and meta tags
*/
function scrapeGeneric(html: string, url: string): ScrapedData {
const $ = cheerio.load(html);
// Try Open Graph tags first
const ogTitle = $('meta[property="og:title"]').attr('content');
const ogDescription = $('meta[property="og:description"]').attr('content');
const ogImage = $('meta[property="og:image"]').attr('content');
const ogPriceAmount = $('meta[property="og:price:amount"]').attr('content');
const ogPriceCurrency = $('meta[property="og:price:currency"]').attr('content');
// Fallback to other meta tags
const metaDescription = $('meta[name="description"]').attr('content');
const title = ogTitle || $('title').text().trim() || null;
const description = ogDescription || metaDescription || null;
const imageUrl = ogImage || $('link[rel="image_src"]').attr('href') || null;
// Try to extract price from OG tags or page content
let price: number | null = null;
let currency: string | null = null;
if (ogPriceAmount && ogPriceCurrency) {
price = parseFloat(ogPriceAmount);
currency = ogPriceCurrency;
} else {
// Try to find price in common selectors
const priceSelectors = [
'.price',
'[data-price]',
'.product-price',
'[itemprop="price"]',
'.a-price .a-offscreen', // Amazon
];
for (const selector of priceSelectors) {
const priceText = $(selector).first().text();
if (priceText) {
const extracted = extractPrice(priceText);
if (extracted.price !== null) {
price = extracted.price;
currency = extracted.currency;
break;
}
}
}
}
return {
title,
description,
price,
currency,
imageUrl,
url,
};
}
/**
* Amazon-specific scraper
*/
function scrapeAmazon(html: string, url: string): ScrapedData {
const $ = cheerio.load(html);
// Amazon-specific selectors
const title = $('#productTitle').text().trim() ||
$('span[id="productTitle"]').text().trim() ||
null;
const description = $('#feature-bullets ul li').first().text().trim() ||
$('meta[name="description"]').attr('content') ||
null;
// Amazon price selectors (they change frequently)
let price: number | null = null;
let currency: string | null = null;
const priceWhole = $('.a-price-whole').first().text().trim();
const priceFraction = $('.a-price-fraction').first().text().trim();
if (priceWhole) {
const priceStr = priceWhole.replace(',', '') + (priceFraction || '00');
price = parseFloat(priceStr);
currency = 'USD'; // Default, could be enhanced to detect from page
}
// Fallback to other price selectors
if (price === null) {
const priceSelectors = [
'.a-price .a-offscreen',
'#priceblock_ourprice',
'#priceblock_dealprice',
'.a-price-whole',
];
for (const selector of priceSelectors) {
const priceText = $(selector).first().text();
if (priceText) {
const extracted = extractPrice(priceText);
if (extracted.price !== null) {
price = extracted.price;
currency = extracted.currency;
break;
}
}
}
}
// Amazon images
const imageUrl = $('#landingImage').attr('src') ||
$('#imgBlkFront').attr('src') ||
$('img[data-old-hires]').attr('data-old-hires') ||
$('meta[property="og:image"]').attr('content') ||
null;
return {
title,
description,
price,
currency,
imageUrl,
url,
};
}
/**
* Target-specific scraper
*/
function scrapeTarget(html: string, url: string): ScrapedData {
const $ = cheerio.load(html);
const title = $('h1[data-test="product-title"]').text().trim() ||
$('meta[property="og:title"]').attr('content') ||
null;
const description = $('div[data-test="item-details-description"]').text().trim() ||
$('meta[property="og:description"]').attr('content') ||
null;
let price: number | null = null;
let currency = 'USD';
const priceText = $('span[data-test="product-price"]').first().text() ||
$('div[data-test="product-price"]').first().text();
if (priceText) {
const extracted = extractPrice(priceText);
price = extracted.price;
currency = extracted.currency || 'USD';
}
const imageUrl = $('img[data-test="product-image"]').attr('src') ||
$('meta[property="og:image"]').attr('content') ||
null;
return {
title,
description,
price,
currency,
imageUrl,
url,
};
}
/**
* Walmart-specific scraper
*/
function scrapeWalmart(html: string, url: string): ScrapedData {
const $ = cheerio.load(html);
const title = $('h1[itemprop="name"]').text().trim() ||
$('meta[property="og:title"]').attr('content') ||
null;
const description = $('div[itemprop="description"]').text().trim() ||
$('meta[property="og:description"]').attr('content') ||
null;
let price: number | null = null;
let currency = 'USD';
const priceText = $('span[itemprop="price"]').first().attr('content') ||
$('span[itemprop="price"]').first().text() ||
$('.price-characteristic').first().text();
if (priceText) {
const extracted = extractPrice(priceText);
price = extracted.price;
currency = extracted.currency || 'USD';
}
const imageUrl = $('img[itemprop="image"]').attr('src') ||
$('meta[property="og:image"]').attr('content') ||
null;
return {
title,
description,
price,
currency,
imageUrl,
url,
};
}
/**
* Best Buy-specific scraper
*/
function scrapeBestBuy(html: string, url: string): ScrapedData {
const $ = cheerio.load(html);
const title = $('h1.heading-5').first().text().trim() ||
$('meta[property="og:title"]').attr('content') ||
null;
const description = $('div.shop-product-description').first().text().trim() ||
$('meta[property="og:description"]').attr('content') ||
null;
let price: number | null = null;
let currency = 'USD';
const priceText = $('div[data-testid="customer-price"] span').first().text() ||
$('.priceView-hero-price span').first().text();
if (priceText) {
const extracted = extractPrice(priceText);
price = extracted.price;
currency = extracted.currency || 'USD';
}
const imageUrl = $('img.primary-image').first().attr('src') ||
$('meta[property="og:image"]').attr('content') ||
null;
return {
title,
description,
price,
currency,
imageUrl,
url,
};
}
/**
* Main scrape function - detects site and uses appropriate scraper
*/
export async function scrapeUrl(url: string): Promise<ScrapedData> {
try {
// Normalize URL
const normalizedUrl = url.startsWith('http') ? url : `https://${url}`;
const urlObj = new URL(normalizedUrl);
const hostname = urlObj.hostname.toLowerCase();
// Fetch HTML
const html = await fetchHtml(normalizedUrl);
// Use site-specific scraper if available
if (hostname.includes('amazon.')) {
return scrapeAmazon(html, normalizedUrl);
} else if (hostname.includes('target.com')) {
return scrapeTarget(html, normalizedUrl);
} else if (hostname.includes('walmart.com')) {
return scrapeWalmart(html, normalizedUrl);
} else if (hostname.includes('bestbuy.com')) {
return scrapeBestBuy(html, normalizedUrl);
}
// Fallback to generic scraper
return scrapeGeneric(html, normalizedUrl);
} catch (error) {
throw new Error(`Scraping failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
}
}