# Advanced Data Extraction Techniques: Beyond Basic Web Scraping
While basic web scraping can collect data, advanced data extraction transforms raw content into structured, actionable insights. This comprehensive guide explores sophisticated techniques for maximizing the value of extracted web content using Zapserp.
## Understanding Content Structure and Metadata
Modern web content extraction goes beyond simple text scraping. It involves understanding document structure, extracting meaningful metadata, and applying intelligent filtering to ensure data quality.
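The examples in this guide read a handful of fields from the extracted page object. As a rough sketch of the shape they assume (field names inferred from how the code below reads them — consult the zapserp typings for the authoritative definitions):

```typescript
// Hypothetical sketch of the Page shape the examples rely on; these
// interfaces are illustrative, not the official zapserp definitions.
interface PageMetadataSketch {
  description?: string
  author?: string
  publishedTime?: string // ISO 8601 date string
  keywords?: string
}

interface PageSketch {
  url: string
  title?: string
  content: string        // extracted plain text / markdown
  contentLength: number
  metadata?: PageMetadataSketch
}
```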
### Leveraging Rich Metadata
Zapserp's content extraction provides rich metadata that many developers underutilize. Here's how to make the most of it:
```typescript
import { Zapserp, Page, PageMetadata } from 'zapserp'

interface EnrichedContent {
  page: Page
  analysis: {
    contentQuality: number
    topicRelevance: number
    authorityScore: number
    freshness: number
    readability: number
  }
  extracted: {
    entities: string[]
    topics: string[]
    sentiment: 'positive' | 'negative' | 'neutral'
    keyPhrases: string[]
  }
}

class AdvancedContentAnalyzer {
  private zapserp: Zapserp

  constructor(apiKey: string) {
    this.zapserp = new Zapserp({ apiKey })
  }

  async analyzeContent(url: string): Promise<EnrichedContent> {
    const page = await this.zapserp.reader({ url })

    const analysis = this.analyzeQuality(page)
    const extracted = this.extractInsights(page)

    return { page, analysis, extracted }
  }

  private analyzeQuality(page: Page): EnrichedContent['analysis'] {
    const analysis = {
      contentQuality: 0,
      topicRelevance: 0, // placeholder: relevance needs a target topic to score against
      authorityScore: 0,
      freshness: 0,
      readability: 0
    }

    // Content quality score (0-100)
    analysis.contentQuality = this.calculateContentQuality(page)

    // Authority score based on metadata and content depth
    analysis.authorityScore = this.calculateAuthorityScore(page)

    // Freshness based on publication date
    analysis.freshness = this.calculateFreshness(page.metadata?.publishedTime)

    // Readability score
    analysis.readability = this.calculateReadability(page.content)

    return analysis
  }

  private calculateContentQuality(page: Page): number {
    let score = 0

    // Content length factor (optimal range: 800-3000 words)
    const wordCount = page.content.split(/\s+/).length
    if (wordCount >= 800 && wordCount <= 3000) {
      score += 30
    } else if (wordCount >= 500) {
      score += 20
    } else if (wordCount >= 300) {
      score += 10
    }

    // Structure indicators (markdown or HTML headings)
    const hasHeaders = /#{1,6}\s/.test(page.content) || /<h[1-6]/.test(page.content)
    if (hasHeaders) score += 15

    // Paragraph structure
    const paragraphCount = page.content.split('\n\n').length
    if (paragraphCount >= 3) score += 15

    // Metadata completeness
    if (page.metadata) {
      if (page.metadata.description) score += 10
      if (page.metadata.author) score += 10
      if (page.metadata.publishedTime) score += 10
      if (page.metadata.keywords) score += 10
    }

    return Math.min(score, 100)
  }

  private calculateAuthorityScore(page: Page): number {
    let score = 0

    // Domain authority indicators
    const domain = this.extractDomain(page.url)
    const authorityDomains = [
      'wikipedia.org', 'stackoverflow.com', 'github.com',
      'medium.com', 'dev.to', 'arxiv.org'
    ]
    if (authorityDomains.some(d => domain.includes(d))) {
      score += 40
    }

    const content = page.content.toLowerCase()

    // Technical depth indicators
    const technicalTerms = [
      'algorithm', 'implementation', 'methodology', 'analysis',
      'research', 'study', 'framework', 'architecture'
    ]
    const techTermCount = technicalTerms.filter(term => content.includes(term)).length
    score += Math.min(techTermCount * 5, 30)

    // Citation indicators
    const citationIndicators = [
      'according to', 'research shows', 'study found',
      'data indicates', 'source:', 'reference'
    ]
    const citationCount = citationIndicators.filter(indicator => content.includes(indicator)).length
    score += Math.min(citationCount * 5, 30)

    return Math.min(score, 100)
  }

  private calculateFreshness(publishedTime?: string): number {
    if (!publishedTime) return 0

    const published = new Date(publishedTime)
    const now = new Date()
    const daysDiff = (now.getTime() - published.getTime()) / (1000 * 60 * 60 * 24)

    if (daysDiff <= 7) return 100   // Within a week
    if (daysDiff <= 30) return 80   // Within a month
    if (daysDiff <= 90) return 60   // Within 3 months
    if (daysDiff <= 365) return 40  // Within a year
    if (daysDiff <= 730) return 20  // Within 2 years
    return 10                       // Older than 2 years
  }

  private calculateReadability(content: string): number {
    // Simplified Flesch Reading Ease calculation
    const sentences = content.split(/[.!?]+/).filter(Boolean).length
    const words = content.split(/\s+/).length
    const syllables = this.countSyllables(content)

    if (sentences === 0 || words === 0) return 0

    const avgWordsPerSentence = words / sentences
    const avgSyllablesPerWord = syllables / words
    const fleschScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord)

    // Clamp to a 0-100 scale
    return Math.max(0, Math.min(100, fleschScore))
  }

  private countSyllables(text: string): number {
    // Simple syllable-counting heuristic: one syllable per vowel group
    const words = text.toLowerCase().match(/\b[a-z]+\b/g) || []
    let totalSyllables = 0

    for (const word of words) {
      const vowelGroups = word.match(/[aeiou]+/g) || []
      let syllables = vowelGroups.length

      // Adjust for silent trailing 'e'
      if (word.endsWith('e') && syllables > 1) {
        syllables--
      }

      // Minimum of 1 syllable per word
      totalSyllables += Math.max(1, syllables)
    }

    return totalSyllables
  }

  private extractInsights(page: Page): EnrichedContent['extracted'] {
    return {
      entities: this.extractEntities(page.content),
      topics: this.extractTopics(page.content),
      sentiment: this.analyzeSentiment(page.content),
      keyPhrases: this.extractKeyPhrases(page.content)
    }
  }

  private extractEntities(content: string): string[] {
    // Simple pattern-based named entity extraction
    const entities = new Set<string>()

    // Company names (capitalized words ending with common suffixes)
    const companyPattern = /\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|Corp|LLC|Ltd|Company|Technologies|Systems|Solutions)\b/g
    for (const company of content.match(companyPattern) || []) entities.add(company)

    // URLs and domains
    const urlPattern = /(?:https?:\/\/)?(?:www\.)?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/g
    for (const url of content.match(urlPattern) || []) entities.add(url)

    // Email addresses
    const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g
    for (const email of content.match(emailPattern) || []) entities.add(email)

    // Dates ("January 5, 2024" style)
    const datePattern = /\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b/g
    for (const date of content.match(datePattern) || []) entities.add(date)

    return Array.from(entities).slice(0, 20) // Limit to top 20
  }

  private extractTopics(content: string): string[] {
    // Topic extraction via keyword frequency
    const text = content.toLowerCase()
    const words = text.match(/\b[a-z]{3,}\b/g) || []

    // Remove common stop words
    const stopWords = new Set([
      'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all',
      'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day',
      'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new',
      'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man',
      'way', 'she', 'use', 'say', 'each', 'which', 'their'
    ])
    const filteredWords = words.filter(word => !stopWords.has(word) && word.length > 3)

    // Count word frequency
    const wordCount = new Map<string, number>()
    filteredWords.forEach(word => {
      wordCount.set(word, (wordCount.get(word) || 0) + 1)
    })

    // Return top keywords by frequency
    return Array.from(wordCount.entries())
      .sort(([, a], [, b]) => b - a)
      .slice(0, 10)
      .map(([word]) => word)
  }

  private analyzeSentiment(content: string): 'positive' | 'negative' | 'neutral' {
    // Simple lexicon-based sentiment analysis
    const positiveWords = [
      'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
      'positive', 'success', 'improve', 'benefit', 'advantage', 'effective'
    ]
    const negativeWords = [
      'bad', 'terrible', 'awful', 'horrible', 'negative', 'problem',
      'issue', 'fail', 'error', 'wrong', 'difficult', 'challenge'
    ]

    const words = content.toLowerCase().match(/\b[a-z]+\b/g) || []
    let positiveScore = 0
    let negativeScore = 0

    words.forEach(word => {
      if (positiveWords.includes(word)) positiveScore++
      if (negativeWords.includes(word)) negativeScore++
    })

    const threshold = Math.max(words.length * 0.01, 2) // 1% of words, minimum 2

    if (positiveScore > negativeScore + threshold) return 'positive'
    if (negativeScore > positiveScore + threshold) return 'negative'
    return 'neutral'
  }

  private extractKeyPhrases(content: string): string[] {
    // Extract 2-3 word phrases that appear multiple times
    const phrases = new Set<string>()
    const words = content.toLowerCase().match(/\b[a-z]+\b/g) || []

    // Candidate 2-word phrases
    for (let i = 0; i < words.length - 1; i++) {
      const phrase = `${words[i]} ${words[i + 1]}`
      if (phrase.length > 6) phrases.add(phrase) // Minimum phrase length
    }

    // Candidate 3-word phrases
    for (let i = 0; i < words.length - 2; i++) {
      const phrase = `${words[i]} ${words[i + 1]} ${words[i + 2]}`
      if (phrase.length > 10) phrases.add(phrase) // Minimum phrase length
    }

    // Count phrase frequency, keeping only phrases that repeat
    const phraseCount = new Map<string, number>()
    const text = content.toLowerCase()
    phrases.forEach(phrase => {
      const count = (text.match(new RegExp(phrase, 'g')) || []).length
      if (count > 1) phraseCount.set(phrase, count)
    })

    // Return top phrases by frequency
    return Array.from(phraseCount.entries())
      .sort(([, a], [, b]) => b - a)
      .slice(0, 10)
      .map(([phrase]) => phrase)
  }

  private extractDomain(url: string): string {
    try {
      return new URL(url).hostname.replace(/^www\./, '')
    } catch {
      return 'unknown'
    }
  }
}

// Usage example
const analyzer = new AdvancedContentAnalyzer('YOUR_API_KEY')
const analysis = await analyzer.analyzeContent('https://example.com/article')

console.log('Content Analysis:', {
  quality: analysis.analysis.contentQuality,
  authority: analysis.analysis.authorityScore,
  freshness: analysis.analysis.freshness,
  readability: analysis.analysis.readability,
  entities: analysis.extracted.entities.slice(0, 5),
  topics: analysis.extracted.topics.slice(0, 5),
  sentiment: analysis.extracted.sentiment
})
```
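Once content is enriched, you will usually want to persist it for downstream use. One simple pattern — a sketch assuming a Node.js environment, with `persistEnrichedContent` and the output path being illustrative names — is appending each record as a line of JSON (NDJSON), which keeps writes append-only and lets later jobs stream the file line by line:

```typescript
import { appendFileSync } from 'fs'

// Sketch: persist one enriched record per line of JSON. Only a few fields
// are kept here; store the full record if downstream jobs need it.
function persistEnrichedContent(record: EnrichedContent, path = 'enriched.ndjson') {
  const row = {
    url: record.page.url,
    quality: record.analysis.contentQuality,
    sentiment: record.extracted.sentiment,
    topics: record.extracted.topics
  }
  appendFileSync(path, JSON.stringify(row) + '\n')
}
```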
## Batch Processing and Data Pipeline Optimization
For large-scale data extraction, an efficient pipeline is crucial. Here's how to process thousands of URLs reliably:
```typescript
interface ProcessingJob {
  id: string
  urls: string[]
  status: 'pending' | 'processing' | 'completed' | 'failed'
  results: EnrichedContent[]
  startTime?: Date
  endTime?: Date
  errorCount: number
}

class DataExtractionPipeline {
  private zapserp: Zapserp
  private analyzer: AdvancedContentAnalyzer
  private concurrencyLimit: number
  private retryAttempts: number
  private static readonly CHUNK_SIZE = 10 // URLs per reader batch

  constructor(apiKey: string, options: {
    concurrencyLimit?: number
    retryAttempts?: number
  } = {}) {
    this.zapserp = new Zapserp({ apiKey })
    this.analyzer = new AdvancedContentAnalyzer(apiKey)
    this.concurrencyLimit = options.concurrencyLimit || 5
    this.retryAttempts = options.retryAttempts || 3
  }

  async processUrlBatch(urls: string[], jobId: string): Promise<ProcessingJob> {
    const job: ProcessingJob = {
      id: jobId,
      urls,
      status: 'processing',
      results: [],
      startTime: new Date(),
      errorCount: 0
    }

    try {
      // Split URLs into manageable chunks
      const chunks = this.chunkArray(urls, DataExtractionPipeline.CHUNK_SIZE)
      let processedCount = 0

      for (let i = 0; i < chunks.length; i += this.concurrencyLimit) {
        const currentChunks = chunks.slice(i, i + this.concurrencyLimit)

        // Process this group of chunks concurrently
        const chunkPromises = currentChunks.map(chunk =>
          this.processBatchWithRetry(chunk)
        )
        const chunkResults = await Promise.allSettled(chunkPromises)

        // Aggregate results, matching each outcome to its own chunk
        chunkResults.forEach((result, index) => {
          if (result.status === 'fulfilled') {
            job.results.push(...result.value.results)
            job.errorCount += result.value.errors
          } else {
            job.errorCount += currentChunks[index].length // whole chunk failed
          }
          processedCount += currentChunks[index].length
        })

        // Progress reporting
        console.log(`Progress: ${processedCount}/${urls.length} URLs processed`)

        // Rate limiting between chunk groups
        if (i + this.concurrencyLimit < chunks.length) {
          await this.sleep(200) // 200ms delay between chunk groups
        }
      }

      job.status = 'completed'
      job.endTime = new Date()
    } catch (error) {
      job.status = 'failed'
      job.endTime = new Date()
      console.error('Pipeline failed:', error)
    }

    return job
  }

  private async processBatchWithRetry(urls: string[], attempt: number = 1): Promise<{
    results: EnrichedContent[]
    errors: number
  }> {
    try {
      const batchResponse = await this.zapserp.readerBatch({ urls })
      const results: EnrichedContent[] = []
      let errors = 0

      // Analyze each extracted page. Note: analyzeContent re-fetches the URL;
      // in production, consider an analyzer method that accepts the
      // already-extracted page to avoid the duplicate request.
      for (const page of batchResponse.results) {
        try {
          if (page && page.content && page.content.length > 100) {
            const analysis = await this.analyzer.analyzeContent(page.url)
            results.push(analysis)
          } else {
            errors++
          }
        } catch (error) {
          console.error(`Analysis failed for ${page?.url}:`, error)
          errors++
        }
      }

      return { results, errors }
    } catch (error) {
      if (attempt < this.retryAttempts) {
        console.log(`Batch failed, retrying (${attempt}/${this.retryAttempts})...`)
        await this.sleep(1000 * 2 ** (attempt - 1)) // Exponential backoff: 1s, 2s, 4s...
        return this.processBatchWithRetry(urls, attempt + 1)
      }
      throw error
    }
  }

  private chunkArray<T>(array: T[], chunkSize: number): T[][] {
    const chunks: T[][] = []
    for (let i = 0; i < array.length; i += chunkSize) {
      chunks.push(array.slice(i, i + chunkSize))
    }
    return chunks
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms))
  }

  async generateReport(job: ProcessingJob): Promise<any> {
    if (job.status !== 'completed') {
      throw new Error('Job not completed')
    }

    const processingTime = job.endTime!.getTime() - job.startTime!.getTime()
    const successRate = (job.results.length / job.urls.length) * 100

    // Aggregate quality metrics (guard against an empty result set)
    const qualityScores = job.results.map(r => r.analysis.contentQuality)
    const avgQuality = qualityScores.length
      ? qualityScores.reduce((a, b) => a + b, 0) / qualityScores.length
      : 0

    // Sentiment distribution
    const sentimentDistribution = job.results.reduce((acc, result) => {
      acc[result.extracted.sentiment]++
      return acc
    }, { positive: 0, negative: 0, neutral: 0 })

    // Top topics across all content
    const allTopics = job.results.flatMap(r => r.extracted.topics)
    const topicCount = new Map<string, number>()
    allTopics.forEach(topic => {
      topicCount.set(topic, (topicCount.get(topic) || 0) + 1)
    })
    const topTopics = Array.from(topicCount.entries())
      .sort(([, a], [, b]) => b - a)
      .slice(0, 10)
      .map(([topic, count]) => ({ topic, count }))

    return {
      summary: {
        totalUrls: job.urls.length,
        successfulExtractions: job.results.length,
        failedExtractions: job.errorCount,
        successRate: Math.round(successRate * 100) / 100,
        processingTimeMs: processingTime,
        avgProcessingTimePerUrl: Math.round(processingTime / job.urls.length)
      },
      quality: {
        averageQualityScore: Math.round(avgQuality * 100) / 100,
        highQualityContent: job.results.filter(r => r.analysis.contentQuality > 70).length,
        mediumQualityContent: job.results.filter(r =>
          r.analysis.contentQuality > 40 && r.analysis.contentQuality <= 70
        ).length,
        lowQualityContent: job.results.filter(r => r.analysis.contentQuality <= 40).length
      },
      insights: {
        sentimentDistribution,
        topTopics,
        avgReadabilityScore: job.results.length
          ? Math.round(
              job.results.reduce((sum, r) => sum + r.analysis.readability, 0) /
                job.results.length * 100
            ) / 100
          : 0
      }
    }
  }
}

// Usage example
const pipeline = new DataExtractionPipeline('YOUR_API_KEY', {
  concurrencyLimit: 3,
  retryAttempts: 2
})

// Process a large batch of URLs
const urlsToProcess = [
  'https://example1.com/article',
  'https://example2.com/blog-post',
  // ... hundreds more URLs
]

const job = await pipeline.processUrlBatch(urlsToProcess, 'job-001')
const report = await pipeline.generateReport(job)
console.log('Pipeline Report:', report)
```
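One design note: the chunked loop above waits for the slowest chunk in each group before starting the next. If you want to keep all workers busy, a sliding-window promise pool is a common alternative. Here's a minimal, Zapserp-independent sketch — `mapWithConcurrency` is a hypothetical helper, not a library API:

```typescript
// Sketch: run `worker` over `items` with at most `limit` promises in flight.
async function mapWithConcurrency<T, R>(
  items: T[],
  limit: number,
  worker: (item: T) => Promise<R>
): Promise<R[]> {
  const results: R[] = new Array(items.length)
  let next = 0

  async function runLane(): Promise<void> {
    while (next < items.length) {
      const index = next++ // claim the next item for this lane
      results[index] = await worker(items[index])
    }
  }

  // Start `limit` lanes; each pulls a new item as soon as it finishes one
  await Promise.all(Array.from({ length: Math.min(limit, items.length) }, runLane))
  return results
}
```

For example, `await mapWithConcurrency(urls, 5, url => analyzer.analyzeContent(url))` keeps five analyses in flight at all times instead of advancing in lockstep groups.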
## Content Filtering and Quality Assessment
Not all extracted content is valuable. Implementing smart filtering ensures you only process high-quality data:
```typescript
interface ContentFilter {
  name: string
  apply: (content: EnrichedContent) => boolean
  priority: number
}

class ContentQualityManager {
  private filters: ContentFilter[] = []
  private duplicateHashes = new Set<string>()

  constructor() {
    this.initializeDefaultFilters()
  }

  private initializeDefaultFilters() {
    // Minimum content length filter
    this.addFilter({
      name: 'MinimumLength',
      apply: (content) => content.page.contentLength >= 500,
      priority: 1
    })

    // Quality score filter
    this.addFilter({
      name: 'QualityScore',
      apply: (content) => content.analysis.contentQuality >= 40,
      priority: 2
    })

    // Language filter (English content)
    this.addFilter({
      name: 'EnglishContent',
      apply: (content) => this.isEnglishContent(content.page.content),
      priority: 3
    })

    // Spam content filter
    this.addFilter({
      name: 'AntiSpam',
      apply: (content) => !this.isSpamContent(content.page.content),
      priority: 4
    })

    // Duplicate content filter
    this.addFilter({
      name: 'UniqueContent',
      apply: (content) => !this.isDuplicateContent(content.page.content),
      priority: 5
    })
  }

  addFilter(filter: ContentFilter) {
    this.filters.push(filter)
    this.filters.sort((a, b) => a.priority - b.priority)
  }

  applyFilters(content: EnrichedContent[]): {
    passed: EnrichedContent[]
    filtered: Array<{
      content: EnrichedContent
      failedFilters: string[]
    }>
  } {
    const passed: EnrichedContent[] = []
    const filtered: Array<{ content: EnrichedContent, failedFilters: string[] }> = []

    for (const item of content) {
      const failedFilters: string[] = []

      for (const filter of this.filters) {
        if (!filter.apply(item)) {
          failedFilters.push(filter.name)
        }
      }

      if (failedFilters.length === 0) {
        passed.push(item)
      } else {
        filtered.push({ content: item, failedFilters })
      }
    }

    return { passed, filtered }
  }

  private isEnglishContent(content: string): boolean {
    // Simple English detection using common English words
    const englishWords = [
      'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all',
      'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day'
    ]
    const words = content.toLowerCase().match(/\b[a-z]+\b/g) || []
    if (words.length === 0) return false

    const englishWordCount = words.filter(word => englishWords.includes(word)).length
    return (englishWordCount / words.length) > 0.1 // At least 10% common English words
  }

  private isSpamContent(content: string): boolean {
    const spamIndicators = [
      'click here', 'buy now', 'limited time', 'act now',
      'free money', 'make money fast', 'guaranteed',
      'congratulations you have won'
    ]
    const text = content.toLowerCase()
    const spamCount = spamIndicators.filter(indicator => text.includes(indicator)).length

    // More than 2 spam indicators: treat as spam
    return spamCount > 2
  }

  private isDuplicateContent(content: string): boolean {
    // Simple content hashing for duplicate detection (first 1000 chars)
    const hash = this.simpleHash(content.substring(0, 1000))
    if (this.duplicateHashes.has(hash)) {
      return true
    }
    this.duplicateHashes.add(hash)
    return false
  }

  private simpleHash(str: string): string {
    let hash = 0
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i)
      hash = ((hash << 5) - hash) + char
      hash = hash & hash // Coerce to a 32-bit integer
    }
    return hash.toString()
  }

  getFilterStats(results: { passed: EnrichedContent[], filtered: any[] }): any {
    const total = results.passed.length + results.filtered.length

    // Count failures by filter
    const filterFailureCounts = new Map<string, number>()
    results.filtered.forEach(item => {
      item.failedFilters.forEach((filter: string) => {
        filterFailureCounts.set(filter, (filterFailureCounts.get(filter) || 0) + 1)
      })
    })

    return {
      total,
      passed: results.passed.length,
      filtered: results.filtered.length,
      passRate: total ? Math.round((results.passed.length / total) * 100) : 0,
      filterBreakdown: Object.fromEntries(filterFailureCounts)
    }
  }
}

// Usage example
const qualityManager = new ContentQualityManager()

// Add a custom filter for technical content
qualityManager.addFilter({
  name: 'TechnicalContent',
  apply: (content) => {
    const techKeywords = ['api', 'algorithm', 'database', 'framework', 'javascript', 'python']
    const text = content.page.content.toLowerCase()
    return techKeywords.some(keyword => text.includes(keyword))
  },
  priority: 6
})

// Apply filters to extracted content
const extractedContent: EnrichedContent[] = [] // Your extracted content
const filterResults = qualityManager.applyFilters(extractedContent)
const stats = qualityManager.getFilterStats(filterResults)

console.log('Filter Results:', stats)
console.log(`High-quality content: ${filterResults.passed.length} items`)
```
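Putting the pieces together, a typical run extracts, enriches, then filters, so only content that clears every gate reaches storage. A sketch of the end-to-end wiring, using the classes defined above:

```typescript
// Sketch: combine the pipeline and the quality manager into one job runner.
async function runExtractionJob(urls: string[]) {
  const pipeline = new DataExtractionPipeline('YOUR_API_KEY', { concurrencyLimit: 3 })
  const qualityManager = new ContentQualityManager()

  // Extract and enrich, then keep only content that passes every filter
  const job = await pipeline.processUrlBatch(urls, `job-${Date.now()}`)
  const { passed, filtered } = qualityManager.applyFilters(job.results)

  console.log(`Kept ${passed.length} of ${job.results.length} extracted pages`)
  console.log('Filter stats:', qualityManager.getFilterStats({ passed, filtered }))

  return passed // hand off only high-quality content downstream
}
```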
## Key Takeaways
- **Leverage Rich Metadata**: Use all available metadata fields to enrich your analysis
- **Implement Quality Scoring**: Develop metrics to automatically assess content value
- **Build Robust Pipelines**: Handle failures gracefully and implement proper retry logic
- **Filter Intelligently**: Remove low-quality content before processing to save resources
- **Monitor Performance**: Track extraction success rates and processing times (a minimal tracker sketch follows this list)
- **Extract Insights**: Go beyond raw text to identify entities, topics, and sentiment
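As a rough illustration of the monitoring point, here is a minimal, framework-free tracker. The names (`ExtractionMetrics`, `record`, `summary`) are hypothetical, not part of Zapserp:

```typescript
// Hypothetical helper for the "Monitor Performance" takeaway: records
// per-URL outcomes and reports success rate and average latency.
class ExtractionMetrics {
  private durationsMs: number[] = []
  private failures = 0

  record(ok: boolean, durationMs: number) {
    if (ok) this.durationsMs.push(durationMs)
    else this.failures++
  }

  summary() {
    const attempts = this.durationsMs.length + this.failures
    const avgMs = this.durationsMs.length
      ? this.durationsMs.reduce((a, b) => a + b, 0) / this.durationsMs.length
      : 0
    return {
      attempts,
      successRate: attempts ? this.durationsMs.length / attempts : 0,
      avgDurationMs: Math.round(avgMs)
    }
  }
}
```

Wrap each reader call with `record()` and log `summary()` at intervals to spot degrading success rates or rising latency early.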
Advanced data extraction transforms raw web content into structured, actionable intelligence. By implementing these techniques, you can build powerful systems that not only collect data but understand and analyze it at scale.
Ready to implement advanced extraction? Contact our technical team for guidance on building production-ready data extraction pipelines.