Content Quality Filtering: Getting the Best Results from Zapserp
Getting great search results is only half the battle. The real value comes from filtering and processing that content to ensure you're working with high-quality, relevant information. This guide shows you how to implement intelligent content filtering to maximize the value of your Zapserp searches.
Why Content Quality Matters
Not all web content is created equal. Weak or missing quality filtering can lead to:
- Misleading or inaccurate information
- Spam and promotional content
- Irrelevant or off-topic results
- Outdated or stale information
- Poor user experience in your applications
Smart filtering ensures your applications deliver reliable, valuable content to users.
Multi-Layer Filtering Strategy
Implement filtering at multiple levels for best results:
import { Zapserp, SearchEngine, SearchResponse, Page } from 'zapserp'
/**
 * Optional knobs that control which search results and pages are kept.
 * Every field may be omitted.
 */
interface QualityFilters {
  // --- Domain-level filters -------------------------------------------------

  /** Domains to restrict results to; when non-empty, only matching hosts are kept. */
  domainWhitelist?: string[]
  /** Domains to exclude from results. */
  domainBlacklist?: string[]

  // --- Content-level filters ------------------------------------------------

  /** Minimum length of a page's content, in characters. */
  contentMinLength?: number
  /** Maximum age of a page in days, measured from its published time when one is available. */
  contentMaxAge?: number
  /** Spam score above which a page is discarded. */
  spamThreshold?: number
  /** Language code sent with the search request and used for the basic content language check. */
  languageFilter?: string
}
/**
 * Runs a Zapserp search and keeps only the results that survive three
 * filtering layers: domain checks, content checks, and relevance scoring.
 *
 * Domain list entries (whitelist, blacklist and the built-in lists) match a
 * host when the host equals the entry or is a subdomain of it, so `bbc.com`
 * matches `news.bbc.com` but not `bbc.com.evil.net` or `notbbc.com`.
 */
class ContentQualityFilter {
  /** Hosts accepted by the default domain check when no whitelist is given. */
  private static readonly HIGH_QUALITY_DOMAINS: readonly string[] = [
    // News & Media
    'reuters.com', 'bbc.com', 'cnn.com', 'nytimes.com', 'wsj.com',
    'bloomberg.com', 'theguardian.com', 'washingtonpost.com', 'npr.org',
    // Technology
    'techcrunch.com', 'wired.com', 'arstechnica.com', 'theverge.com',
    'stackoverflow.com', 'github.com', 'developer.mozilla.org',
    // Academic & Research
    'wikipedia.org', 'arxiv.org', 'nature.com', 'science.org',
    'pubmed.ncbi.nlm.nih.gov', 'scholar.google.com',
    // Business & Finance
    'harvard.edu', 'mit.edu', 'stanford.edu', 'investopedia.com'
  ]

  /** Hosts rejected by the default domain check. */
  private static readonly LOW_QUALITY_DOMAINS: readonly string[] = [
    'pinterest.com', 'youtube.com', 'facebook.com', 'twitter.com',
    'instagram.com', 'tiktok.com', 'reddit.com'
  ]

  /** Lower-case phrases that each add 0.1 to a page's spam score. */
  private static readonly SPAM_PHRASES: readonly string[] = [
    'click here', 'buy now', 'limited time', 'act fast', 'free money',
    'guaranteed', 'no risk', 'make money fast', 'work from home',
    'lose weight fast', 'miracle cure', 'amazing results'
  ]

  /** Common English words used by the basic language check. */
  private static readonly COMMON_ENGLISH_WORDS: ReadonlySet<string> = new Set([
    'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all',
    'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day'
  ])

  private zapserp: Zapserp
  private defaultFilters: QualityFilters

  /**
   * @param apiKey Key passed to the Zapserp client.
   * @param defaultFilters Overrides for the built-in defaults
   *   (min length 200, max age 365 days, language `en`, spam threshold 0.3).
   */
  constructor(apiKey: string, defaultFilters: QualityFilters = {}) {
    this.zapserp = new Zapserp({ apiKey })
    this.defaultFilters = {
      contentMinLength: 200,
      contentMaxAge: 365,
      languageFilter: 'en',
      spamThreshold: 0.3,
      ...defaultFilters
    }
  }

  /**
   * Searches for `query`, filters the hits, and returns them ranked by quality.
   *
   * @param query Search query; its whitespace-separated words drive relevance scoring.
   * @param customFilters Per-call overrides merged over the instance defaults.
   * @returns `results` sorted by descending `qualityScore`, `filtered` (number of
   *   search hits that did not make it into `results`, including hits skipped by
   *   the 10-URL extraction limit), and the average `qualityScore` (0 when empty).
   */
  async searchWithQualityFilter(
    query: string,
    customFilters: Partial<QualityFilters> = {}
  ): Promise<{
    results: Array<Page & { qualityScore: number }>
    filtered: number
    qualityScore: number
  }> {
    const filters = { ...this.defaultFilters, ...customFilters }

    // Step 1: Get search results
    const searchResponse = await this.zapserp.search({
      query,
      engines: [SearchEngine.GOOGLE, SearchEngine.BING],
      limit: 20, // Get more results to filter from
      language: filters.languageFilter
    })

    // Step 2: Filter URLs by domain quality
    const qualityUrls = this.filterByDomain(
      searchResponse.results.map(r => r.url),
      filters
    )

    if (qualityUrls.length === 0) {
      // Same shape as the normal return so callers can always read qualityScore.
      return { results: [], filtered: searchResponse.results.length, qualityScore: 0 }
    }

    // Step 3: Extract content from quality URLs
    const contentResponse = await this.zapserp.readerBatch({
      urls: qualityUrls.slice(0, 10) // Limit content extraction
    })

    // Step 4: Apply content-level filters
    const qualityContent = this.filterByContent(contentResponse.results, filters)

    // Step 5: Score and rank results
    const scoredResults = this.scoreAndRank(qualityContent, query)

    return {
      results: scoredResults,
      filtered: searchResponse.results.length - scoredResults.length,
      qualityScore: this.calculateAverageQuality(scoredResults)
    }
  }

  /**
   * Keeps the URLs whose host passes the domain rules, in input order.
   *
   * Rules, in order: unparseable URLs are dropped; blacklisted hosts are
   * dropped; if a whitelist is given only whitelisted hosts are kept;
   * otherwise the built-in default check decides.
   */
  private filterByDomain(urls: string[], filters: QualityFilters): string[] {
    const whitelist = filters.domainWhitelist ?? []
    const blacklist = filters.domainBlacklist ?? []

    return urls.filter(url => {
      const domain = this.extractDomain(url)

      // extractDomain signals an unparseable URL with 'unknown'; never fetch those.
      if (domain === 'unknown') return false

      // The blacklist always wins, even when a whitelist is also configured.
      if (blacklist.some(blocked => this.matchesDomain(domain, blocked))) {
        return false
      }

      if (whitelist.length > 0) {
        return whitelist.some(allowed => this.matchesDomain(domain, allowed))
      }

      return this.isQualityDomain(domain)
    })
  }

  /**
   * True when `domain` is `entry` itself or a subdomain of it.
   * `entry` is trimmed, lower-cased and stripped of a leading `www.`;
   * an empty entry matches nothing.
   */
  private matchesDomain(domain: string, entry: string): boolean {
    const normalized = entry.trim().toLowerCase().replace(/^www\./, '')
    if (normalized === '') return false
    return domain === normalized || domain.endsWith(`.${normalized}`)
  }

  /**
   * Default domain check used when no whitelist is configured: rejects known
   * low-quality hosts, accepts known high-quality hosts, and otherwise accepts
   * any host that does not contain an obvious spam marker.
   */
  private isQualityDomain(domain: string): boolean {
    if (ContentQualityFilter.LOW_QUALITY_DOMAINS.some(d => this.matchesDomain(domain, d))) {
      return false
    }

    if (ContentQualityFilter.HIGH_QUALITY_DOMAINS.some(d => this.matchesDomain(domain, d))) {
      return true
    }

    // Substring heuristic for unknown hosts. NOTE: this also rejects hosts
    // that merely contain these letters (e.g. "downloads.example.com").
    return !domain.includes('spam') &&
      !domain.includes('ads') &&
      !domain.includes('affiliate')
  }

  /**
   * Keeps pages that have content and pass the length, age, spam and
   * language checks.
   */
  private filterByContent(pages: Page[], filters: QualityFilters): Page[] {
    // `??` (not `||`) so that an explicit 0 is honoured instead of replaced.
    const minLength = filters.contentMinLength ?? 200
    const spamThreshold = filters.spamThreshold ?? 0.3

    return pages.filter(page => {
      if (!page || !page.content) return false

      // Content length filter
      if (page.content.length < minLength) return false

      // Age filter: skipped when contentMaxAge is unset or 0, or when the page
      // carries no published time.
      if (filters.contentMaxAge && page.metadata?.publishedTime) {
        const publishedDate = new Date(page.metadata.publishedTime)
        const daysSincePublished =
          (Date.now() - publishedDate.getTime()) / (1000 * 60 * 60 * 24)
        if (daysSincePublished > filters.contentMaxAge) return false
      }

      // Spam detection
      if (this.calculateSpamScore(page.content) > spamThreshold) return false

      // Language detection (basic)
      if (filters.languageFilter && !this.isTargetLanguage(page.content, filters.languageFilter)) {
        return false
      }

      return true
    })
  }

  /**
   * Heuristic spam score in the range 0..1, built from four signals:
   * spam phrases (0.1 each), all-caps words (max 0.2), exclamation marks
   * (max 0.1) and heavy word repetition (0.3).
   */
  private calculateSpamScore(content: string): number {
    const text = content.toLowerCase()
    let spamScore = 0

    // Spam indicators
    const spamCount = ContentQualityFilter.SPAM_PHRASES
      .filter(phrase => text.includes(phrase)).length
    spamScore += spamCount * 0.1

    // Excessive capitalization: words of three or more capital letters
    const capitalWords = content.match(/\b[A-Z]{3,}\b/g) ?? []
    spamScore += Math.min(capitalWords.length * 0.02, 0.2)

    // Excessive exclamation marks
    const exclamationCount = (content.match(/!/g) ?? []).length
    spamScore += Math.min(exclamationCount * 0.01, 0.1)

    // Excessive repetition: share of words that repeat an earlier word
    const words = text.split(/\s+/)
    const uniqueWords = new Set(words)
    const repetitionRatio = 1 - (uniqueWords.size / words.length)
    spamScore += repetitionRatio > 0.7 ? 0.3 : 0

    return Math.min(spamScore, 1)
  }

  /**
   * Basic language check. Only English (`en`) is implemented; any other
   * target language is accepted without inspection. English content must have
   * more than 5% of its words in the common-word list.
   */
  private isTargetLanguage(content: string, targetLang: string): boolean {
    if (targetLang !== 'en') return true // Only implement English detection for now

    const words = content.toLowerCase().match(/\b[a-z]+\b/g) ?? []
    // No Latin-letter words at all: cannot be English (also avoids 0/0).
    if (words.length === 0) return false

    const englishWordCount = words
      .filter(word => ContentQualityFilter.COMMON_ENGLISH_WORDS.has(word)).length
    return (englishWordCount / words.length) > 0.05
  }

  /**
   * Scores each page for relevance and quality and returns copies sorted by
   * descending `qualityScore` (rounded to one decimal).
   *
   * Query words are matched literally and case-insensitively; blank tokens
   * are ignored so that an empty or padded query adds no score.
   */
  private scoreAndRank(pages: Page[], query: string): Array<Page & { qualityScore: number }> {
    const queryWords = query.toLowerCase().split(/\s+/).filter(word => word.length > 0)

    return pages.map(page => {
      let score = 0

      // Relevance scoring
      const content = page.content.toLowerCase()
      const title = page.title?.toLowerCase() || ''

      for (const word of queryWords) {
        // Title matches (high weight)
        if (title.includes(word)) score += 3

        // Content matches: literal counting, so words such as "c++" cannot
        // be misread as (or crash as) a regular expression.
        score += Math.min(this.countOccurrences(content, word) * 0.1, 2)
      }

      // Content quality indicators
      const wordCount = page.content.split(/\s+/).length

      // Optimal length bonus
      if (wordCount >= 500 && wordCount <= 2000) score += 2
      else if (wordCount >= 200) score += 1

      // Metadata completeness
      if (page.metadata?.author) score += 1
      if (page.metadata?.publishedTime) score += 1
      if (page.metadata?.description) score += 0.5

      // Structure indicators
      if (page.content.includes('\n\n')) score += 0.5 // Paragraphs
      if (/#{1,6}\s/.test(page.content)) score += 0.5 // Headers

      return {
        ...page,
        qualityScore: Math.round(score * 10) / 10
      }
    }).sort((a, b) => b.qualityScore - a.qualityScore)
  }

  /** Counts non-overlapping literal occurrences of `needle` in `haystack`. */
  private countOccurrences(haystack: string, needle: string): number {
    if (needle === '') return 0

    let count = 0
    let index = haystack.indexOf(needle)
    while (index !== -1) {
      count++
      index = haystack.indexOf(needle, index + needle.length)
    }
    return count
  }

  /** Mean `qualityScore` rounded to one decimal; 0 for an empty list. */
  private calculateAverageQuality(results: Array<{ qualityScore: number }>): number {
    if (results.length === 0) return 0

    const total = results.reduce((sum, result) => sum + result.qualityScore, 0)
    return Math.round((total / results.length) * 10) / 10
  }

  /**
   * Returns the lower-case hostname of `url` without a leading `www.`,
   * or `'unknown'` when `url` cannot be parsed.
   */
  private extractDomain(url: string): string {
    try {
      // Anchored so that only a leading "www." label is removed.
      return new URL(url).hostname.replace(/^www\./, '').toLowerCase()
    } catch {
      return 'unknown'
    }
  }
}
Practical Usage Examples
News Content Filtering
// News-only filter: a short list of established outlets, recent articles only,
// and a stricter spam threshold than the default.
const newsFilter = new ContentQualityFilter('YOUR_API_KEY', {
  domainWhitelist: [
    'reuters.com', 'bbc.com', 'cnn.com', 'nytimes.com',
    // The Guardian is served from theguardian.com, so name that host exactly.
    'wsj.com', 'bloomberg.com', 'theguardian.com'
  ],
  contentMinLength: 300,
  contentMaxAge: 7, // Only recent news
  spamThreshold: 0.2 // Stricter spam filtering
})

const newsResults = await newsFilter.searchWithQualityFilter(
  'artificial intelligence latest developments'
)

console.log(`Found ${newsResults.results.length} quality news articles`)
console.log(`Average quality score: ${newsResults.qualityScore}`)
Academic Research Filtering
// Research-oriented filter: scholarly publishers and preprint servers only,
// long-form content, and the tightest spam threshold of the examples.
const academicFilter = new ContentQualityFilter('YOUR_API_KEY', {
  spamThreshold: 0.1, // Very strict spam filtering
  contentMinLength: 1000, // Longer content expected
  domainWhitelist: [
    'arxiv.org',
    'nature.com',
    'science.org',
    'pubmed.ncbi.nlm.nih.gov',
    'ieee.org',
    'acm.org',
    'springer.com',
    'wiley.com'
  ]
})

const researchResults = await academicFilter.searchWithQualityFilter('machine learning neural networks research')
Technical Documentation Filtering
// Developer-documentation filter: Q&A sites, code hosts and vendor docs,
// with a lower minimum length because good technical answers can be short.
const techFilter = new ContentQualityFilter('YOUR_API_KEY', {
  contentMinLength: 150,
  domainBlacklist: [
    'youtube.com',
    'pinterest.com'
  ],
  domainWhitelist: [
    'stackoverflow.com',
    'github.com',
    'developer.mozilla.org',
    'docs.microsoft.com',
    'aws.amazon.com',
    'cloud.google.com'
  ]
})

const techResults = await techFilter.searchWithQualityFilter('React hooks best practices tutorial')
Advanced Filtering Techniques
Content Deduplication
/**
 * Extends the base filter with near-duplicate removal based on a short
 * fingerprint of each page's content.
 */
class AdvancedContentFilter extends ContentQualityFilter {
  /**
   * Returns `pages` with later duplicates removed; the first page seen for
   * each fingerprint is kept and input order is preserved.
   *
   * Declared `protected` (rather than `private`) so that subclasses can call
   * it; as a private method with no caller inside this class it was
   * unreachable. Pages without content are fingerprinted as the empty string
   * instead of throwing.
   */
  protected deduplicateContent(pages: Page[]): Page[] {
    const seen = new Set<string>()

    return pages.filter(page => {
      // Create content fingerprint
      const fingerprint = this.createContentFingerprint(page?.content ?? '')

      if (seen.has(fingerprint)) {
        return false
      }

      seen.add(fingerprint)
      return true
    })
  }

  /**
   * Builds the fingerprint: the first 200 characters of the content after
   * lower-casing, collapsing whitespace runs to single spaces, and trimming.
   * Pages that differ only after that prefix therefore count as duplicates.
   */
  private createContentFingerprint(content: string): string {
    return content
      .toLowerCase()
      .replace(/\s+/g, ' ')
      .trim()
      .substring(0, 200)
  }
}
Dynamic Quality Thresholds
/**
 * Trims `results` to roughly the `targetCount` best entries by quality score.
 *
 * The score of the entry at position `targetCount` (after sorting) becomes
 * the threshold, and every entry scoring at least that much is kept, so ties
 * at the threshold can yield more than `targetCount` entries.
 *
 * @param results Scored results; the array is not modified.
 * @param targetCount Desired number of results. Values below 1 yield an empty
 *   list unless `results` is already that short.
 * @returns `results` itself (unsorted) when it already has at most
 *   `targetCount` entries; otherwise a new array sorted by descending score.
 */
private adaptiveQualityThreshold<T extends { qualityScore: number }>(
  results: T[],
  targetCount: number
): T[] {
  if (results.length <= targetCount) return results

  const cutoffIndex = Math.floor(targetCount) - 1
  if (cutoffIndex < 0) return []

  // Sort a copy so the caller's array keeps its original order.
  const sorted = [...results].sort((a, b) => b.qualityScore - a.qualityScore)

  // Find natural break point
  const cutoff = sorted[cutoffIndex]
  if (cutoff === undefined) return sorted

  return sorted.filter(r => r.qualityScore >= cutoff.qualityScore)
}
Testing Your Filters
Always test your filtering logic:
// Smoke-test the filter against queries with a minimum acceptable average quality.
// The filter under test must be created first; the defaults are used here.
const filter = new ContentQualityFilter('YOUR_API_KEY')

const testCases = [
  { query: 'climate change research', expectedMinQuality: 7 },
  { query: 'tech startup news', expectedMinQuality: 6 },
  { query: 'cryptocurrency analysis', expectedMinQuality: 5 }
]

for (const testCase of testCases) {
  const results = await filter.searchWithQualityFilter(testCase.query)
  // Treat a missing average (no results) as 0 so the comparison is meaningful.
  const averageQuality = results.qualityScore ?? 0

  console.log(`${testCase.query}: Average quality ${averageQuality} (expected: ${testCase.expectedMinQuality})`)

  // Actually check the expectation instead of only printing it.
  if (averageQuality < testCase.expectedMinQuality) {
    console.warn(`${testCase.query}: below expected minimum quality`)
  }
}
Best Practices
- Layer Your Filters: Apply multiple filtering stages for best results
- Monitor Quality Scores: Track average quality over time to tune filters
- Domain Lists: Maintain and update your quality domain lists regularly
- Content Age: Consider freshness requirements for your specific use case
- Spam Detection: Implement multiple spam indicators for robust filtering
- Performance: Balance quality filtering with response times
Conclusion
Implementing smart content quality filtering transforms raw search results into valuable, reliable information. By filtering at multiple levels—domain, content, and relevance—you ensure your applications deliver high-quality content that users can trust.
The key is finding the right balance between filtering strictness and result quantity for your specific use case. Start with conservative filters and adjust based on your quality requirements and user feedback.
Ready to implement quality filtering? Start with the basic domain filtering and gradually add more sophisticated content analysis as your needs evolve.