implement url cleaning based on ClearURLs rules and logic

This commit is contained in:
m5r
2025-09-20 11:48:38 +02:00
parent ca369dc516
commit 44ebc2b487
15 changed files with 8063 additions and 6424 deletions

241
src/cleaner.ts Normal file
View File

@@ -0,0 +1,241 @@
import type { ClearURLsRules } from "./types";
/**
 * Cleans a URL: unwraps redirections described by the ClearURLs rules, follows HTTP
 * redirects, and finally strips tracking parameters from the URL that is reached.
 *
 * The function recurses once per hop. `visited` records every URL already processed so
 * that loops terminate, and no further hop is attempted once `visited` holds
 * `maxRedirects` entries.
 *
 * @param inputUrl URL to clean; must be parseable by `new URL`.
 * @param rules ClearURLs rule set whose `providers` drive redirection and parameter removal.
 * @param maxRedirects Upper bound on the number of distinct URLs visited.
 * @param visited URLs already processed in this chain (shared across recursive calls).
 * @returns The cleaned URL. If anything throws along the way (including an unparseable
 *          input), the error is logged and `inputUrl` is returned unchanged.
 */
export async function cleanUrl(inputUrl: string, rules: ClearURLsRules, maxRedirects = 5, visited = new Set<string>()): Promise<string> {
  try {
    const parsed = new URL(inputUrl);
    const href = parsed.href;

    // Stop hopping when we have been here before or the hop budget is used up.
    const exhausted = visited.has(href) || visited.size >= maxRedirects;
    if (!exhausted) {
      visited.add(href);

      // Rule-based redirections take precedence over real HTTP redirects.
      const ruleTarget = checkClearUrlsRedirections(href, rules.providers);
      if (ruleTarget && ruleTarget !== href) {
        return await cleanUrl(ruleTarget, rules, maxRedirects, visited);
      }

      const httpTarget = await followRedirect(href);
      if (httpTarget && httpTarget !== href) {
        return await cleanUrl(httpTarget, rules, maxRedirects, visited);
      }
    }

    return cleanUrlParameters(parsed, rules.providers);
  } catch (error) {
    console.error(`Error caught when trying to clean url ${inputUrl}`, error);
    return inputUrl;
  }
}
/**
 * Sends a single `HEAD` request (without auto-following) and reports where the server
 * redirects to, if anywhere.
 *
 * The `Location` header is resolved against the requested URL, so absolute,
 * root-relative (`/x`), path-relative (`x`) and protocol-relative (`//host/x`) values
 * all yield an absolute URL.
 *
 * @param url Absolute URL to probe.
 * @returns The absolute redirect target when the response status is 3xx and a `Location`
 *          header is present; `null` otherwise, and also when the request fails or the
 *          header cannot be resolved to a valid URL. Always `null` when
 *          `NODE_ENV === "test"`.
 */
async function followRedirect(url: string): Promise<string | null> {
  // @ts-ignore - Skip redirect following in tests to avoid external HTTP calls
  if (typeof global !== "undefined" && global.process?.env?.NODE_ENV === "test") {
    return null;
  }
  try {
    const response = await fetch(url, {
      method: "HEAD",
      redirect: "manual",
    });
    if (response.status < 300 || response.status >= 400) {
      return null;
    }
    const location = response.headers.get("Location");
    if (!location) {
      return null;
    }
    // Let the URL parser do the resolution instead of string concatenation: this handles
    // every relative form and rejects garbage (which lands in the catch below).
    return new URL(location, url).href;
  } catch {
    // Best effort: a network failure or unusable Location simply means "no redirect".
    return null;
  }
}
/**
 * Applies the matching provider's rules to a URL and returns the resulting string.
 *
 * Order of operations: provider lookup, `completeProvider` block, exception check,
 * key-based `rules` (query parameters, then fragment entries), and finally `rawRules`
 * applied to the whole URL string.
 *
 * @param url Parsed URL to clean; it is modified in place by the rule helpers.
 * @param providers Provider table from the ClearURLs rule set.
 * @returns The cleaned URL as a string, or the untouched `href` when no provider
 *          matches or an exception applies.
 * @throws Error when the matching provider is marked `completeProvider`.
 */
function cleanUrlParameters(url: URL, providers: ClearURLsRules["providers"]): string {
  const provider = findMatchingProvider(url.href, providers);
  if (!provider) {
    return url.href;
  }
  if (provider.completeProvider === true) {
    throw new Error("URL blocked by ClearURLs rules");
  }
  if (provider.exceptions && isException(url.href, provider.exceptions)) {
    return url.href;
  }

  const keyRules = provider.rules;
  if (keyRules && keyRules.length > 0) {
    url = cleanFragmentsByRules(cleanParametersByRules(url, keyRules), keyRules);
  }

  const rawRules = provider.rawRules;
  if (rawRules && rawRules.length > 0) {
    let href = url.href;
    for (const pattern of rawRules) {
      try {
        href = href.replace(new RegExp(pattern, "gi"), "");
      } catch (error) {
        console.warn(`Invalid raw rule regex: ${pattern}`, error);
      }
    }
    // Only adopt the raw-rule result if it is still a valid URL.
    try {
      url = new URL(href);
    } catch (error) {
      console.warn("Raw rule produced invalid URL, skipping", error);
    }
  }

  return url.href;
}
/**
 * Finds the provider whose `urlPattern` matches the given URL.
 *
 * Providers are tried in object-entry order and the first match wins. The entry named
 * `globalRules` is excluded from that search and is returned only as the fallback when
 * no other provider matches (it may be `undefined` if the table has no such entry).
 *
 * Patterns are compiled case-insensitively, consistent with the exception and
 * redirection patterns in this module. A pattern that is not a valid regular
 * expression is logged and skipped.
 *
 * @param url URL string to test.
 * @param providers Provider table from the ClearURLs rule set.
 * @returns The first matching provider, otherwise `providers.globalRules`.
 */
function findMatchingProvider(url: string, providers: ClearURLsRules["providers"]) {
  const { globalRules, ...otherProviders } = providers;
  for (const [providerName, provider] of Object.entries(otherProviders)) {
    try {
      if (new RegExp(provider.urlPattern, "i").test(url)) {
        return provider;
      }
    } catch (error) {
      console.warn(`Invalid URL pattern for provider ${providerName}: ${provider.urlPattern}`, error);
    }
  }
  return globalRules;
}
/**
 * Removes every query parameter whose key fully matches one of the rules.
 *
 * Each rule is a regular-expression source; it is wrapped as `^rule$` and matched
 * case-insensitively against the parameter key. Rules that do not compile are logged
 * and ignored. Parameters that survive keep their relative order, and repeated keys
 * (e.g. `a=1&a=2`) are all preserved.
 *
 * @param url URL whose `search` is rewritten in place.
 * @param rules Regular-expression sources for parameter keys to drop.
 * @returns The same `url` object, for chaining.
 */
function cleanParametersByRules(url: URL, rules: string[]) {
  // Compile once up front rather than once per (parameter, rule) pair.
  const matchers: RegExp[] = [];
  for (const rule of rules) {
    try {
      matchers.push(new RegExp("^" + rule + "$", "i"));
    } catch (error) {
      console.warn(`Invalid rule regex: ${rule}`, error);
    }
  }

  const kept = new URLSearchParams();
  for (const [key, value] of new URLSearchParams(url.search)) {
    if (!matchers.some((matcher) => matcher.test(key))) {
      // append (not set) so that repeated keys are not collapsed into one.
      kept.append(key, value);
    }
  }

  url.search = kept.toString();
  return url;
}
/**
 * Removes every `key[=value]` entry in the URL fragment whose key fully matches one of
 * the rules.
 *
 * Each rule is a regular-expression source, wrapped as `^rule$` and matched
 * case-insensitively against the fragment key. A rule that fails to compile is logged
 * and treated as non-matching.
 *
 * @param url URL whose `hash` is rewritten in place.
 * @param rules Regular-expression sources for fragment keys to drop.
 * @returns The same `url` object, for chaining.
 */
function cleanFragmentsByRules(url: URL, rules: string[]) {
  const kept = new Map<string, string | null>();

  extractFragments(url).forEach((value, key) => {
    const blocked = rules.some((rule) => {
      try {
        return new RegExp("^" + rule + "$", "gi").test(key);
      } catch (error) {
        console.warn(`Invalid rule regex: ${rule}`, error);
        return false;
      }
    });
    if (!blocked) {
      kept.set(key, value);
    }
  });

  url.hash = fragmentsToString(kept);
  return url;
}
/**
 * Parses the URL fragment as an `&`-separated list of `key` or `key=value` entries.
 *
 * Each entry is split at its first `=` only, so values that themselves contain `=`
 * (for example base64 padding in `token=abc==`) are kept intact. Entries with an empty
 * key are skipped. An entry without a value, or with an empty value, maps to `null`.
 * When a key occurs more than once the last occurrence wins.
 *
 * @param url URL whose `hash` is read (the leading `#` is ignored).
 * @returns Map from fragment key to its value, in order of first appearance.
 */
function extractFragments(url: URL): Map<string, string | null> {
  const fragments = new Map<string, string | null>();
  const hash = url.hash.slice(1); // Remove the #
  if (!hash) return fragments;

  for (const entry of hash.split("&")) {
    const separator = entry.indexOf("=");
    const key = separator === -1 ? entry : entry.slice(0, separator);
    if (!key) continue;
    const rawValue = separator === -1 ? "" : entry.slice(separator + 1);
    fragments.set(key, rawValue === "" ? null : rawValue);
  }
  return fragments;
}
/**
 * Serializes fragment entries back into `key=value&key` form.
 *
 * Entries with a `null` value are written as the bare key; all others as `key=value`.
 * Entries are joined with `&` in map order. No leading `#` is added.
 *
 * @param fragments Fragment entries to serialize.
 * @returns The joined string, or `""` for an empty map.
 */
function fragmentsToString(fragments: Map<string, string | null>): string {
  return Array.from(fragments, ([key, value]) => (value === null ? key : `${key}=${value}`)).join("&");
}
/**
 * Tells whether a URL is covered by any of a provider's exception patterns.
 *
 * Patterns are regular-expression sources tested case-insensitively (unanchored)
 * against the URL, in order, stopping at the first match. A pattern that fails to
 * compile is logged and treated as non-matching.
 *
 * @param url URL string to test.
 * @param exceptions Regular-expression sources describing exempt URLs.
 * @returns `true` if at least one pattern matches, otherwise `false`.
 */
function isException(url: string, exceptions: string[]): boolean {
  return exceptions.some((pattern) => {
    try {
      return new RegExp(pattern, "i").test(url);
    } catch (error) {
      console.warn(`Invalid exception regex: ${pattern}`, error);
      return false;
    }
  });
}
/**
 * Extracts the real destination from a URL that wraps another URL, using the matching
 * provider's `redirections` patterns.
 *
 * Patterns are tried in order, case-insensitively. For the first pattern that matches
 * with a non-empty first capture group, that group is URI-decoded and returned. Any
 * error raised while handling a pattern (compilation or decoding) is logged and the
 * next pattern is tried.
 *
 * @param url URL string to inspect.
 * @param providers Provider table from the ClearURLs rule set.
 * @returns The decoded target URL, or `null` when no provider, no redirection patterns,
 *          or no usable match exists.
 */
function checkClearUrlsRedirections(url: string, providers: ClearURLsRules["providers"]): string | null {
  const patterns = findMatchingProvider(url, providers)?.redirections;
  if (!patterns) {
    return null;
  }

  for (const redirectionPattern of patterns) {
    try {
      // By convention the first capture group holds the (encoded) target URL.
      const target = new RegExp(redirectionPattern, "i").exec(url)?.[1];
      if (target) {
        return decodeURIComponent(target);
      }
    } catch (error) {
      console.warn(`Invalid redirection regex: ${redirectionPattern}`, error);
    }
  }

  return null;
}

View File

@@ -1,18 +1,36 @@
/**
* Welcome to Cloudflare Workers! This is your first worker.
*
* - Run `npm run dev` in your terminal to start a development server
* - Open a browser tab at http://localhost:8787/ to see your worker in action
* - Run `npm run deploy` to publish your worker
*
* Bind resources to your worker in `wrangler.jsonc`. After adding bindings, a type definition for the
* `Env` object can be regenerated with `npm run cf-typegen`.
*
* Learn more at https://developers.cloudflare.com/workers/
*/
import { cleanUrl } from "./cleaner";
import { RulesCache } from "./rules-cache";
/**
 * Bindings available to the worker's handlers.
 */
type Env = {
/** Durable Object namespace for the `RulesCache` class; the fetch handler looks up its instance by name. */
RULES_CACHE: DurableObjectNamespace<RulesCache>;
};
export { RulesCache };
/**
 * Worker entry point.
 *
 * The `fetch` handler reads the `url` query parameter from the incoming request,
 * obtains the rule set from the `RulesCache` Durable Object and responds with the
 * cleaned URL as plain text.
 */
export default {
// NOTE(review): the next two lines look like the previous version of the handler left
// over from a diff view (its body is never closed) — confirm and drop if so.
async fetch(request, env, ctx): Promise<Response> {
return new Response('Hello World!');
async fetch(request, env, _ctx): Promise<Response> {
const url = new URL(request.url);
// The URL to clean is passed as ?url=...
const targetUrl = url.searchParams.get("url");
if (!targetUrl) {
return new Response("Missing url parameter", { status: 400 });
}
try {
// The rule set is served by the Durable Object instance named "rules".
const rulesStub = env.RULES_CACHE.getByName("rules");
const rules = await rulesStub.getRules();
const cleanedUrl = await cleanUrl(targetUrl, rules);
return new Response(cleanedUrl, {
headers: {
"Content-Type": "text/plain",
// Allow any origin to read the response.
"Access-Control-Allow-Origin": "*",
},
});
} catch (error) {
// Any failure while loading rules or cleaning is reported as a 500 with the error message.
return new Response(`Error processing URL: ${error instanceof Error ? error.message : "Unknown error"}`, {
status: 500,
});
}
},
} satisfies ExportedHandler<Env>;

77
src/rules-cache.ts Normal file
View File

@@ -0,0 +1,77 @@
import { DurableObject } from "cloudflare:workers";
import type { ClearURLsRules } from "./types";
// Location of the minified ClearURLs rule set that is downloaded and cached.
const RULES_URL = "https://rules2.clearurls.xyz/data.minify.json";
// Location of the published hash; it is compared with the SHA-256 hex digest of the downloaded rules text.
const HASH_URL = "https://rules2.clearurls.xyz/rules.minify.hash";
// How long a stored rule set is considered fresh before a new download is attempted.
const CACHE_DURATION_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
/**
 * Shape of the record persisted under the "rules" storage key.
 */
type CachedRules = {
/** The parsed rule set. */
data: ClearURLsRules;
/** SHA-256 hex digest of the rules text that was validated at download time. */
hash: string;
/** `Date.now()` timestamp (ms) at which the record was written. */
cachedAt: number;
/** Timestamp (ms) after which the record is no longer considered fresh. */
expiresAt: number;
};
/**
 * Durable Object that keeps a hash-verified copy of the ClearURLs rule set in its
 * storage and refreshes it once the stored copy has expired.
 */
export class RulesCache extends DurableObject {
  /**
   * Returns the rule set, preferring the stored copy while it is still fresh.
   *
   * When the stored copy is missing or expired, a fresh copy is downloaded, verified
   * and stored. If anything fails along the way, the stored copy is returned even if
   * it has expired.
   *
   * @returns The ClearURLs rule set.
   * @throws Error when the rules cannot be obtained and nothing is stored to fall back on.
   */
  async getRules() {
    try {
      const stored = await this.ctx.storage.get<CachedRules>("rules");
      if (stored && Date.now() < stored.expiresAt) {
        return stored.data;
      }

      console.log("Fetching fresh rules from ClearURLs");
      return await this.fetchAndCacheRules();
    } catch (error) {
      console.error("Error getting rules:", error);

      // Stale rules are better than none: re-read storage and ignore the expiry.
      const stale = await this.ctx.storage.get<CachedRules>("rules");
      if (!stale) {
        throw new Error("Failed to get rules and no cached fallback available");
      }
      console.log("Falling back to expired cached rules");
      return stale.data;
    }
  }

  /**
   * Downloads the rules and their published hash in parallel, verifies that the
   * SHA-256 of the rules text equals the (trimmed) published hash, then parses and
   * stores the rules with a new expiry.
   *
   * @returns The freshly downloaded rule set.
   * @throws Error when either download fails or the hash does not match.
   */
  private async fetchAndCacheRules() {
    const [rulesResponse, hashResponse] = await Promise.all([fetch(RULES_URL), fetch(HASH_URL)]);
    if (!rulesResponse.ok) {
      throw new Error(`Failed to fetch rules: ${rulesResponse.status}`);
    }
    if (!hashResponse.ok) {
      throw new Error(`Failed to fetch hash: ${hashResponse.status}`);
    }

    const [rulesText, expectedHash] = await Promise.all([rulesResponse.text(), hashResponse.text()]);
    const publishedHash = expectedHash.trim();
    const actualHash = await this.calculateSHA256(rulesText);
    if (actualHash !== publishedHash) {
      throw new Error(`Hash validation failed. Expected: ${publishedHash}, Actual: ${actualHash}`);
    }

    // Parse only after the text has been verified.
    const rules = JSON.parse(rulesText) as ClearURLsRules;
    const now = Date.now();
    const record: CachedRules = {
      data: rules,
      hash: actualHash,
      cachedAt: now,
      expiresAt: now + CACHE_DURATION_MS,
    };
    await this.ctx.storage.put("rules", record);
    console.log(`Cached rules with hash: ${actualHash}`);
    return rules;
  }

  /**
   * Computes the SHA-256 digest of the UTF-8 encoding of `text`.
   *
   * @returns The digest as a lowercase hexadecimal string.
   */
  private async calculateSHA256(text: string) {
    const digest = await crypto.subtle.digest("SHA-256", new TextEncoder().encode(text));
    let hex = "";
    for (const byte of new Uint8Array(digest)) {
      hex += byte.toString(16).padStart(2, "0");
    }
    return hex;
  }
}

14
src/types.ts Normal file
View File

@@ -0,0 +1,14 @@
/**
 * One provider entry of the ClearURLs rule set. All pattern fields hold
 * regular-expression sources as strings.
 */
export type ClearURLsProvider = {
/** Pattern tested against the full URL to decide whether this provider applies. */
urlPattern: string;
/** When `true`, the cleaner refuses the URL by throwing instead of cleaning it. */
completeProvider?: boolean;
/** Patterns matched (anchored as `^…$`, case-insensitive) against query-parameter and fragment keys; matching entries are removed. */
rules?: string[];
/** Patterns applied (global, case-insensitive) to the whole URL string; every match is deleted. */
rawRules?: string[];
/** Not read by the cleaner code visible in this change — presumably referral parameters from the upstream data; confirm before relying on it. */
referralMarketing?: string[];
/** Patterns tested (case-insensitive) against the URL; on a match the URL is returned unchanged. */
exceptions?: string[];
/** Patterns whose first capture group holds the URI-encoded real destination of a wrapping URL. */
redirections?: string[];
/** Not read by the cleaner code visible in this change — TODO confirm intended semantics. */
forceRedirection?: boolean;
};
/**
 * Top-level shape of the ClearURLs rules document.
 */
export type ClearURLsRules = {
/** Providers keyed by name. The cleaner treats the entry named `globalRules` as the fallback when no other provider matches. */
providers: Record<string, ClearURLsProvider>;
};