© 2026 CosmicBytez Labs. All rights reserved.

Automated News Aggregation with Deduplication Algorithms
HOWTO · Intermediate


Build a news aggregation system that fetches from multiple RSS feeds, detects duplicate articles using Jaccard similarity, and generates clean...

Dylan H.

Software Engineering

February 3, 2026
6 min read

Prerequisites

  • Node.js 18+
  • Basic understanding of RSS/XML
  • JavaScript async patterns

Overview

Aggregating news from multiple sources inevitably introduces duplicate articles — the same story reported by different outlets, or syndicated content appearing in multiple feeds. This guide builds a system that fetches, deduplicates, and scores articles from multiple RSS feeds.

Pipeline

RSS Feeds ──► Fetch & Parse ──► Normalize ──► Deduplicate ──► Score ──► Output
  (10+)         (parallel)       (clean)     (Jaccard)     (rank)    (JSON/MDX)

Requirements

npm install xml2js

Node.js 18+ ships a global fetch, so no separate fetch package is needed.

Process

Step 1: Feed Configuration

// config/feeds.js
export const FEEDS = [
  {
    name: "The Hacker News",
    url: "https://feeds.feedburner.com/TheHackersNews",
    category: "security",
    weight: 1.2,  // Higher weight = more trusted source
  },
  {
    name: "BleepingComputer",
    url: "https://www.bleepingcomputer.com/feed/",
    category: "security",
    weight: 1.1,
  },
  {
    name: "TechCrunch",
    url: "https://techcrunch.com/feed/",
    category: "tech",
    weight: 1.0,
  },
  {
    name: "Ars Technica",
    url: "https://feeds.arstechnica.com/arstechnica/index",
    category: "tech",
    weight: 1.0,
  },
  // Add more feeds...
];

Step 2: RSS Fetcher

// lib/fetcher.js
import { parseStringPromise } from "xml2js";
 
export async function fetchFeed(feed) {
  try {
    const response = await fetch(feed.url, {
      signal: AbortSignal.timeout(10000),
      headers: { "User-Agent": "CosmicBytez News Aggregator/1.0" },
    });
 
    if (!response.ok) throw new Error(`HTTP ${response.status}`);
 
    const xml = await response.text();
    const parsed = await parseStringPromise(xml, { trim: true });
 
    // Handle both RSS 2.0 and Atom formats
    const items = parsed.rss?.channel?.[0]?.item
      || parsed.feed?.entry
      || [];
 
    return items.map((item) => normalizeItem(item, feed));
  } catch (err) {
    console.warn(`Failed to fetch ${feed.name}: ${err.message}`);
    return [];
  }
}
 
function normalizeItem(item, feed) {
  // RSS 2.0 format
  if (item.title?.[0]) {
    return {
      title: cleanText(item.title[0]),
      link: item.link?.[0] || "",
      description: cleanText(item.description?.[0] || ""),
      pubDate: new Date(item.pubDate?.[0] || Date.now()),
      source: feed.name,
      category: feed.category,
      weight: feed.weight,
    };
  }
 
  // Atom format
  return {
    title: cleanText(item.title?.[0]?._ || item.title?.[0] || ""),
    link: item.link?.[0]?.$?.href || "",
    description: cleanText(item.summary?.[0]?._ || item.summary?.[0] || ""),
    pubDate: new Date(item.updated?.[0] || item.published?.[0] || Date.now()),
    source: feed.name,
    category: feed.category,
    weight: feed.weight,
  };
}
 
function cleanText(text) {
  if (typeof text !== "string") return "";
  return text
    .replace(/<[^>]*>/g, "")      // Strip HTML tags
    .replace(/&#?\w+;/g, " ")     // Remove HTML entities (named and numeric)
    .replace(/\s+/g, " ")         // Normalize whitespace
    .trim();
}
 
export async function fetchAllFeeds(feeds) {
  const results = await Promise.allSettled(
    feeds.map((feed) => fetchFeed(feed))
  );
 
  return results
    .filter((r) => r.status === "fulfilled")
    .flatMap((r) => r.value);
}

Step 3: Jaccard Similarity Deduplication

// lib/deduplicator.js
 
/**
 * Calculate Jaccard similarity between two strings.
 * J(A,B) = |A ∩ B| / |A ∪ B|
 * Returns 0 (completely different) to 1 (identical).
 */
function jaccardSimilarity(textA, textB) {
  const tokensA = tokenize(textA);
  const tokensB = tokenize(textB);
 
  if (tokensA.size === 0 && tokensB.size === 0) return 1;
  if (tokensA.size === 0 || tokensB.size === 0) return 0;
 
  let intersection = 0;
  for (const token of tokensA) {
    if (tokensB.has(token)) intersection++;
  }
 
  const union = tokensA.size + tokensB.size - intersection;
  return intersection / union;
}
 
function tokenize(text) {
  return new Set(
    text
      .toLowerCase()
      .replace(/[^\w\s]/g, "")
      .split(/\s+/)
      .filter((w) => w.length > 2) // Skip short words
  );
}
 
/**
 * Remove duplicate articles based on title similarity.
 * When duplicates are found, keep the one from the highest-weighted source.
 */
export function deduplicate(articles, threshold = 0.6) {
  const unique = [];
  const seen = [];
 
  for (const article of articles) {
    let isDuplicate = false;
 
    for (let i = 0; i < seen.length; i++) {
      const similarity = jaccardSimilarity(article.title, seen[i].title);
 
      if (similarity >= threshold) {
        isDuplicate = true;
        const idx = unique.indexOf(seen[i]);
        if (idx !== -1) {
          if (article.weight > seen[i].weight) {
            // Keep the copy from the higher-weighted source
            unique[idx] = { ...article, duplicateCount: (unique[idx].duplicateCount || 1) + 1 };
          } else {
            // Keep the existing copy; just bump its duplicate count
            unique[idx].duplicateCount = (unique[idx].duplicateCount || 1) + 1;
          }
          // Point `seen` at the object stored in `unique`, so indexOf still
          // finds it when a third copy of the same story shows up
          seen[i] = unique[idx];
        }
        break;
      }
    }
 
    if (!isDuplicate) {
      article.duplicateCount = 1;
      unique.push(article);
      seen.push(article);
    }
  }
 
  return unique;
}
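As a standalone sanity check of the logic above, here is a condensed version run on three invented articles (titles, sources, and weights are made up; the helpers are repeated so the snippet runs on its own):

```javascript
// Condensed copies of tokenize/jaccardSimilarity so this runs standalone
const tokenize = (t) =>
  new Set(t.toLowerCase().replace(/[^\w\s]/g, "").split(/\s+/).filter((w) => w.length > 2));

function jaccard(a, b) {
  const A = tokenize(a);
  const B = tokenize(b);
  let inter = 0;
  for (const t of A) if (B.has(t)) inter++;
  return inter / (A.size + B.size - inter);
}

function dedupe(articles, threshold = 0.6) {
  const unique = [];
  for (const article of articles) {
    const match = unique.find((u) => jaccard(article.title, u.title) >= threshold);
    if (!match) {
      unique.push({ ...article, duplicateCount: 1 });
    } else if (article.weight > match.weight) {
      // Same story from a more trusted source: swap it in, keep the count
      unique[unique.indexOf(match)] = { ...article, duplicateCount: match.duplicateCount + 1 };
    } else {
      match.duplicateCount += 1;
    }
  }
  return unique;
}

const sample = [
  { title: "Microsoft patches critical Exchange Server vulnerability", source: "TechCrunch", weight: 1.0 },
  { title: "Microsoft patches critical vulnerability in Exchange Server", source: "The Hacker News", weight: 1.2 },
  { title: "New JavaScript runtime released in beta", source: "Ars Technica", weight: 1.0 },
];

const result = dedupe(sample);
console.log(result.length); // → 2 (the two Exchange headlines collapse into one)
console.log(result[0].source, result[0].duplicateCount); // → The Hacker News 2
```

The two Exchange headlines are pure reorderings of each other, so their similarity is well above 0.6, and the higher-weight source wins the slot.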

Step 4: Relevance Scoring

// lib/scorer.js
 
export function scoreArticles(articles) {
  const now = Date.now();
 
  return articles
    .map((article) => {
      let score = 0;
 
      // Recency: newer = higher score (linear decay, floored at 0)
      const hoursAgo = (now - article.pubDate.getTime()) / (1000 * 60 * 60);
      score += Math.max(0, 100 - hoursAgo * 2);
 
      // Source trust weight
      score *= article.weight;
 
      // Duplicate count boost: covered by multiple sources = more important
      if (article.duplicateCount > 1) {
        score *= 1 + (article.duplicateCount - 1) * 0.2;
      }
 
      // Title engagement signals
      const titleLower = article.title.toLowerCase();
      if (titleLower.includes("critical") || titleLower.includes("zero-day")) {
        score *= 1.3;
      }
      if (titleLower.includes("vulnerability") || titleLower.includes("breach")) {
        score *= 1.2;
      }
 
      return { ...article, score: Math.round(score) };
    })
    .sort((a, b) => b.score - a.score);
}
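To make the multipliers concrete, here is the same arithmetic applied to a single invented article (the headline, timestamp, weight, and duplicate count are made up for illustration; the steps mirror the scorer above):

```javascript
// An invented article: published 10 hours ago, from a weight-1.2 source,
// reported by 2 outlets, with "critical" in the title.
const article = {
  title: "Critical flaw in popular router firmware",
  pubDate: new Date(Date.now() - 10 * 60 * 60 * 1000),
  weight: 1.2,
  duplicateCount: 2,
};

const hoursAgo = (Date.now() - article.pubDate.getTime()) / (1000 * 60 * 60);
let score = Math.max(0, 100 - hoursAgo * 2);     // recency:       100 - 20    = 80
score *= article.weight;                         // source trust:  80  * 1.2   = 96
score *= 1 + (article.duplicateCount - 1) * 0.2; // multi-source:  96  * 1.2   = 115.2
if (article.title.toLowerCase().includes("critical")) {
  score *= 1.3;                                  // keyword boost: 115.2 * 1.3 = 149.76
}

const finalScore = Math.round(score);
console.log(finalScore); // → 150
```

Because the boosts are multiplicative on top of the recency base, a stale story never outranks fresh coverage on keywords alone.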

Step 5: Output Generator

// lib/output.js
import fs from "fs/promises";
import path from "path";
 
export async function generateMDX(articles, outputDir, maxArticles = 10) {
  const top = articles.slice(0, maxArticles);
  const date = new Date().toISOString().split("T")[0];
 
  const frontmatter = `---
title: "Tech & Security News Roundup — ${date}"
excerpt: "Top ${top.length} stories from ${new Set(top.map(a => a.source)).size} sources."
date: ${date}
author: "News Aggregator"
authorRole: "News Desk"
tags: ${JSON.stringify([...new Set(top.map(a => a.category))])}
featured: true
category: "Roundup"
sources: ${JSON.stringify([...new Set(top.map(a => a.source))])}
---`;
 
  const body = top
    .map((a, i) => `
## ${i + 1}. ${a.title}
 
${a.description}
 
**Source:** ${a.source} | **Published:** ${a.pubDate.toLocaleDateString()}${
      a.duplicateCount > 1
        ? ` | *Reported by ${a.duplicateCount} sources*`
        : ""
    }
 
[Read full article →](${a.link})
`)
    .join("\n---\n");
 
  const content = `${frontmatter}\n\n${body}`;
  const filename = `news-roundup-${date}.mdx`;
  await fs.mkdir(outputDir, { recursive: true }); // Create output dir if missing
  await fs.writeFile(path.join(outputDir, filename), content);
 
  return filename;
}

Step 6: Main Script

// main.js
import { FEEDS } from "./config/feeds.js";
import { fetchAllFeeds } from "./lib/fetcher.js";
import { deduplicate } from "./lib/deduplicator.js";
import { scoreArticles } from "./lib/scorer.js";
import { generateMDX } from "./lib/output.js";
 
async function run() {
  console.log(`Fetching from ${FEEDS.length} feeds...`);
  const articles = await fetchAllFeeds(FEEDS);
  console.log(`  Fetched ${articles.length} articles`);
 
  const unique = deduplicate(articles, 0.6);
  console.log(`  After deduplication: ${unique.length} unique articles`);
  console.log(`  Removed ${articles.length - unique.length} duplicates`);
 
  const scored = scoreArticles(unique);
  const filename = await generateMDX(scored, "./content/news");
  console.log(`  Generated: ${filename}`);
}
 
run();

Similarity Threshold Tuning

Threshold | Behavior                                      | Best For
----------|-----------------------------------------------|-------------------------
0.4       | Aggressive — catches loosely related articles | Few sources, noisy feeds
0.6       | Balanced — catches same-story rewrites        | General use
0.8       | Conservative — only near-identical titles     | Many unique sources
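To see what those thresholds mean in practice, here are two invented headline pairs run through the same tokenizer and similarity function (condensed copies of the ones above):

```javascript
const tokenize = (t) =>
  new Set(t.toLowerCase().replace(/[^\w\s]/g, "").split(/\s+/).filter((w) => w.length > 2));

function jaccard(a, b) {
  const A = tokenize(a);
  const B = tokenize(b);
  let inter = 0;
  for (const t of A) if (B.has(t)) inter++;
  return inter / (A.size + B.size - inter);
}

// Pure reordering of the same words: similarity 1.0, caught at any threshold
const s1 = jaccard(
  "Microsoft patches critical Exchange Server vulnerability",
  "Microsoft patches critical vulnerability in Exchange Server"
);
console.log(s1); // → 1

// Same story, different wording: 3 shared tokens out of 10 total, so 0.3,
// which slips past even the aggressive 0.4 threshold
const s2 = jaccard(
  "Critical Exchange Server flaw exploited in the wild",
  "Microsoft patches critical Exchange Server vulnerability"
);
console.log(s2); // → 0.3
```

Title-only Jaccard reliably catches reorderings and light rewrites, but genuinely different phrasings of the same story can score below any workable threshold; comparing title plus description trades precision for recall.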

Key Takeaways

  • Jaccard similarity on tokenized titles is fast and effective for deduplication
  • Source weights let you prefer trusted outlets when duplicates are found
  • Articles covered by multiple sources deserve higher relevance scores
  • Always set fetch timeouts — RSS feeds are unreliable
  • Schedule the script (cron/GitHub Actions) for automated daily aggregation
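Scheduling can also live inside Node itself. A minimal dependency-free daily scheduler might look like this (the 07:00 run time and the wiring to run are illustrative):

```javascript
// Compute milliseconds until the next occurrence of a given local hour
function msUntilHour(hour, now = new Date()) {
  const next = new Date(now);
  next.setHours(hour, 0, 0, 0);
  if (next <= now) next.setDate(next.getDate() + 1); // already passed today
  return next - now;
}

// Run the task at the given hour, then re-arm for the next day
function scheduleDaily(hour, task) {
  setTimeout(async () => {
    await task();
    scheduleDaily(hour, task);
  }, msUntilHour(hour));
}

// scheduleDaily(7, run); // aggregate every day at 07:00
```

For production, cron or a GitHub Actions schedule trigger is still the more robust choice, since this process must stay alive between runs.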

Tags: Node.js, RSS, automation, Algorithms, Data Processing
