Overview
Aggregating news from multiple sources inevitably introduces duplicate articles — the same story reported by different outlets, or syndicated content appearing in multiple feeds. This guide builds a system that fetches, deduplicates, and scores articles from multiple RSS feeds.
Pipeline
RSS Feeds ──► Fetch & Parse ──► Normalize ──► Deduplicate ──► Score ──► Output
(10+) (parallel) (clean) (Jaccard) (rank) (JSON/MDX)

Requirements
npm install xml2js node-fetch

Process
Step 1: Feed Configuration
// config/feeds.js
// Registry of RSS/Atom feeds consumed by the aggregator. Fields per entry:
//   name     - human-readable source label (carried onto articles, used in logs/output)
//   url      - feed endpoint fetched by lib/fetcher.js
//   category - topic tag attached to every article from this feed
//   weight   - source trust multiplier: deduplication keeps the article from the
//              highest-weighted source, and scoring multiplies the score by it
export const FEEDS = [
{
name: "The Hacker News",
url: "https://feeds.feedburner.com/TheHackersNews",
category: "security",
weight: 1.2, // Higher weight = more trusted source
},
{
name: "BleepingComputer",
url: "https://www.bleepingcomputer.com/feed/",
category: "security",
weight: 1.1,
},
{
name: "TechCrunch",
url: "https://techcrunch.com/feed/",
category: "tech",
weight: 1.0,
},
{
name: "Ars Technica",
url: "https://feeds.arstechnica.com/arstechnica/index",
category: "tech",
weight: 1.0,
},
// Add more feeds...
];

Step 2: RSS Fetcher
// lib/fetcher.js
import { parseStringPromise } from "xml2js";
export async function fetchFeed(feed) {
  // Fetch a single RSS/Atom feed and return its normalized items.
  // Any network, HTTP, or XML-parse failure is logged and converted to an
  // empty list so one broken feed never aborts the whole aggregation run.
  try {
    const response = await fetch(feed.url, {
      signal: AbortSignal.timeout(10000), // feeds are unreliable; cap at 10s
      headers: { "User-Agent": "CosmicBytez News Aggregator/1.0" },
    });
    if (!response.ok) throw new Error(`HTTP ${response.status}`);

    const body = await response.text();
    const doc = await parseStringPromise(body, { trim: true });

    // RSS 2.0 nests items under rss.channel[0].item; Atom uses feed.entry.
    const entries = doc.rss?.channel?.[0]?.item ?? doc.feed?.entry ?? [];
    return entries.map((entry) => normalizeItem(entry, feed));
  } catch (err) {
    console.warn(`Failed to fetch ${feed.name}: ${err.message}`);
    return [];
  }
}
/**
 * Normalize one parsed feed entry into the aggregator's article shape.
 *
 * BUG FIX: the original detected RSS vs Atom by the truthiness of
 * item.title?.[0]. xml2js parses attribute-bearing elements to an object
 * ({ _: text, $: attrs }) — which is truthy — so Atom entries were routed
 * down the RSS branch, yielding empty titles (cleanText rejects non-strings)
 * and object-valued links. Detect the format structurally instead: Atom
 * entries carry <link href="..."/> (attributes under `$`) and an
 * <updated>/<published> timestamp; RSS 2.0 items have a plain-text <link>
 * and a <pubDate>.
 */
function normalizeItem(item, feed) {
  const isAtom =
    item.link?.[0]?.$ !== undefined ||
    item.updated !== undefined ||
    item.published !== undefined;

  if (!isAtom) {
    // RSS 2.0 format
    return {
      title: cleanText(textOf(item.title?.[0])),
      link: item.link?.[0] || "",
      description: cleanText(textOf(item.description?.[0])),
      pubDate: new Date(item.pubDate?.[0] || Date.now()),
      source: feed.name,
      category: feed.category,
      weight: feed.weight,
    };
  }

  // Atom format
  return {
    title: cleanText(textOf(item.title?.[0])),
    link: item.link?.[0]?.$.href || "",
    description: cleanText(textOf(item.summary?.[0])),
    pubDate: new Date(item.updated?.[0] || item.published?.[0] || Date.now()),
    source: feed.name,
    category: feed.category,
    weight: feed.weight,
  };
}

// Unwrap an xml2js text node: elements with attributes parse to
// { _: text, $: attrs }; plain elements parse to a bare string.
function textOf(node) {
  if (typeof node === "string") return node;
  return node?._ || "";
}
/**
 * Strip markup from a feed string: remove HTML tags, replace HTML entities
 * with a space, and collapse whitespace. Non-strings (e.g. xml2js object
 * nodes) normalize to "".
 *
 * BUG FIX: the original entity regex /&[a-z]+;/gi matched only NAMED
 * entities (&amp;, &quot;, ...) and left numeric character references
 * (&#8217;, &#x27;) in the text, where they polluted titles and the
 * dedup tokenizer. Match both named and numeric forms.
 */
function cleanText(text) {
  if (typeof text !== "string") return "";
  return text
    .replace(/<[^>]*>/g, "") // Strip HTML tags
    .replace(/&#?[a-z0-9]+;/gi, " ") // Named AND numeric HTML entities
    .replace(/\s+/g, " ") // Normalize whitespace
    .trim();
}
export async function fetchAllFeeds(feeds) {
const results = await Promise.allSettled(
feeds.map((feed) => fetchFeed(feed))
);
return results
.filter((r) => r.status === "fulfilled")
.flatMap((r) => r.value);
}

Step 3: Jaccard Similarity Deduplication
// lib/deduplicator.js
/**
 * Jaccard similarity of two strings over their word sets:
 * J(A,B) = |A ∩ B| / |A ∪ B|, from 0 (disjoint) to 1 (identical).
 * Two empty token sets are considered identical (returns 1).
 */
function jaccardSimilarity(textA, textB) {
  const setA = tokenize(textA);
  const setB = tokenize(textB);

  // Degenerate cases: both empty -> identical; exactly one empty -> disjoint.
  if (setA.size === 0 && setB.size === 0) return 1;
  if (setA.size === 0 || setB.size === 0) return 0;

  let shared = 0;
  setA.forEach((word) => {
    if (setB.has(word)) shared += 1;
  });

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  return shared / (setA.size + setB.size - shared);
}

/**
 * Lowercase, strip punctuation, and split on whitespace into a Set of
 * words, discarding words of one or two characters (stop-word-ish noise).
 */
function tokenize(text) {
  const words = text
    .toLowerCase()
    .replace(/[^\w\s]/g, "")
    .split(/\s+/);
  return new Set(words.filter((word) => word.length > 2));
}
/**
* Remove duplicate articles based on title similarity.
* When duplicates are found, keep the one from the highest-weighted source.
*/
export function deduplicate(articles, threshold = 0.6) {
const unique = [];
const seen = [];
for (const article of articles) {
let isDuplicate = false;
for (let i = 0; i < seen.length; i++) {
const similarity = jaccardSimilarity(article.title, seen[i].title);
if (similarity >= threshold) {
isDuplicate = true;
// Keep the one with higher source weight
if (article.weight > seen[i].weight) {
// Replace existing with higher-weight source
const idx = unique.indexOf(seen[i]);
if (idx !== -1) {
unique[idx] = { ...article, duplicateCount: (unique[idx].duplicateCount || 1) + 1 };
seen[i] = article;
}
} else {
// Just increment duplicate count
const idx = unique.indexOf(seen[i]);
if (idx !== -1) {
unique[idx].duplicateCount = (unique[idx].duplicateCount || 1) + 1;
}
}
break;
}
}
if (!isDuplicate) {
article.duplicateCount = 1;
unique.push(article);
seen.push(article);
}
}
return unique;
}

Step 4: Relevance Scoring
// lib/scorer.js
export function scoreArticles(articles) {
const now = Date.now();
return articles
.map((article) => {
let score = 0;
// Recency: newer = higher score (exponential decay)
const hoursAgo = (now - article.pubDate.getTime()) / (1000 * 60 * 60);
score += Math.max(0, 100 - hoursAgo * 2);
// Source trust weight
score *= article.weight;
// Duplicate count boost: covered by multiple sources = more important
if (article.duplicateCount > 1) {
score *= 1 + (article.duplicateCount - 1) * 0.2;
}
// Title engagement signals
const titleLower = article.title.toLowerCase();
if (titleLower.includes("critical") || titleLower.includes("zero-day")) {
score *= 1.3;
}
if (titleLower.includes("vulnerability") || titleLower.includes("breach")) {
score *= 1.2;
}
return { ...article, score: Math.round(score) };
})
.sort((a, b) => b.score - a.score);
}

Step 5: Output Generator
// lib/output.js
import fs from "fs/promises";
import path from "path";
export async function generateMDX(articles, outputDir, maxArticles = 10) {
const top = articles.slice(0, maxArticles);
const date = new Date().toISOString().split("T")[0];
const frontmatter = `---
title: "Tech & Security News Roundup — ${date}"
excerpt: "Top ${top.length} stories from ${new Set(top.map(a => a.source)).size} sources."
date: ${date}
author: "News Aggregator"
authorRole: "News Desk"
tags: ${JSON.stringify([...new Set(top.flatMap(a => [a.category]))])}
featured: true
category: "Roundup"
sources: ${JSON.stringify([...new Set(top.map(a => a.source))])}
---`;
const body = top
.map((a, i) => `
## ${i + 1}. ${a.title}
${a.description}
**Source:** ${a.source} | **Published:** ${a.pubDate.toLocaleDateString()}${
a.duplicateCount > 1
? ` | *Reported by ${a.duplicateCount} sources*`
: ""
}
[Read full article →](${a.link})
`)
.join("\n---\n");
const content = `${frontmatter}\n\n${body}`;
const filename = `news-roundup-${date}.mdx`;
await fs.writeFile(path.join(outputDir, filename), content);
return filename;
}

Step 6: Main Script
// main.js
import { FEEDS } from "./config/feeds.js";
import { fetchAllFeeds } from "./lib/fetcher.js";
import { deduplicate } from "./lib/deduplicator.js";
import { scoreArticles } from "./lib/scorer.js";
import { generateMDX } from "./lib/output.js";
/**
 * End-to-end pipeline: fetch all feeds -> deduplicate -> score -> write MDX.
 *
 * BUG FIX: the final log line printed the literal text "$(unknown)" (a
 * leftover placeholder using shell-style interpolation, which JS template
 * literals do not expand) instead of the generated filename.
 */
async function run() {
  console.log(`Fetching from ${FEEDS.length} feeds...`);
  const articles = await fetchAllFeeds(FEEDS);
  console.log(` Fetched ${articles.length} articles`);

  const unique = deduplicate(articles, 0.6);
  console.log(` After deduplication: ${unique.length} unique articles`);
  console.log(` Removed ${articles.length - unique.length} duplicates`);

  const scored = scoreArticles(unique);
  const filename = await generateMDX(scored, "./content/news");
  console.log(` Generated: ${filename}`);
}
run();

Similarity Threshold Tuning
| Threshold | Behavior | Best For |
|---|---|---|
| 0.4 | Aggressive — catches loosely related articles | Few sources, noisy feeds |
| 0.6 | Balanced — catches same-story rewrites | General use |
| 0.8 | Conservative — only near-identical titles | Many unique sources |
Key Takeaways
- Jaccard similarity on tokenized titles is fast and effective for deduplication
- Source weights let you prefer trusted outlets when duplicates are found
- Articles covered by multiple sources deserve higher relevance scores
- Always set fetch timeouts — RSS feeds are unreliable
- Schedule the script (cron/GitHub Actions) for automated daily aggregation