Implement URL cleaning based on ClearURLs rules and logic
This commit is contained in:
241
src/cleaner.ts
Normal file
241
src/cleaner.ts
Normal file
@@ -0,0 +1,241 @@
|
||||
import type { ClearURLsRules } from "./types";
|
||||
|
||||
export async function cleanUrl(inputUrl: string, rules: ClearURLsRules, maxRedirects = 5, visited = new Set<string>()): Promise<string> {
|
||||
try {
|
||||
const currentUrl = new URL(inputUrl);
|
||||
|
||||
if (visited.has(currentUrl.href) || visited.size >= maxRedirects) {
|
||||
return cleanUrlParameters(currentUrl, rules.providers);
|
||||
}
|
||||
|
||||
visited.add(currentUrl.href);
|
||||
|
||||
// Check for ClearURLs redirections first
|
||||
const clearUrlsRedirect = checkClearUrlsRedirections(currentUrl.href, rules.providers);
|
||||
if (clearUrlsRedirect && clearUrlsRedirect !== currentUrl.href) {
|
||||
return await cleanUrl(clearUrlsRedirect, rules, maxRedirects, visited);
|
||||
}
|
||||
|
||||
// Then check for HTTP redirects
|
||||
const redirectTarget = await followRedirect(currentUrl.href);
|
||||
if (redirectTarget && redirectTarget !== currentUrl.href) {
|
||||
return await cleanUrl(redirectTarget, rules, maxRedirects, visited);
|
||||
}
|
||||
|
||||
return cleanUrlParameters(currentUrl, rules.providers);
|
||||
} catch (error) {
|
||||
console.error(`Error caught when trying to clean url ${inputUrl}`, error);
|
||||
return inputUrl;
|
||||
}
|
||||
}
|
||||
|
||||
async function followRedirect(url: string): Promise<string | null> {
|
||||
// @ts-ignore - Skip redirect following in tests to avoid external HTTP calls
|
||||
if (typeof global !== "undefined" && global.process?.env?.NODE_ENV === "test") {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(url, {
|
||||
method: "HEAD",
|
||||
redirect: "manual",
|
||||
});
|
||||
|
||||
if (response.status >= 300 && response.status < 400) {
|
||||
const location = response.headers.get("Location");
|
||||
if (location) {
|
||||
// Handle relative redirects
|
||||
if (location.startsWith("/")) {
|
||||
const baseUrl = new URL(url);
|
||||
return `${baseUrl.protocol}//${baseUrl.host}${location}`;
|
||||
}
|
||||
return location;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function cleanUrlParameters(url: URL, providers: ClearURLsRules["providers"]): string {
|
||||
const matchingProvider = findMatchingProvider(url.href, providers);
|
||||
|
||||
if (!matchingProvider) {
|
||||
return url.href;
|
||||
}
|
||||
|
||||
if (matchingProvider.completeProvider === true) {
|
||||
throw new Error("URL blocked by ClearURLs rules");
|
||||
}
|
||||
|
||||
if (matchingProvider.exceptions && isException(url.href, matchingProvider.exceptions)) {
|
||||
return url.href;
|
||||
}
|
||||
|
||||
if (matchingProvider.rules && matchingProvider.rules.length > 0) {
|
||||
url = cleanParametersByRules(url, matchingProvider.rules);
|
||||
url = cleanFragmentsByRules(url, matchingProvider.rules);
|
||||
}
|
||||
|
||||
if (matchingProvider.rawRules && matchingProvider.rawRules.length > 0) {
|
||||
let cleaned = url.href;
|
||||
for (const rawRule of matchingProvider.rawRules) {
|
||||
try {
|
||||
cleaned = cleaned.replace(new RegExp(rawRule, "gi"), "");
|
||||
} catch (error) {
|
||||
console.warn(`Invalid raw rule regex: ${rawRule}`, error);
|
||||
}
|
||||
}
|
||||
try {
|
||||
url = new URL(cleaned);
|
||||
} catch (error) {
|
||||
console.warn("Raw rule produced invalid URL, skipping", error);
|
||||
}
|
||||
}
|
||||
|
||||
return url.href;
|
||||
}
|
||||
|
||||
function findMatchingProvider(url: string, providers: ClearURLsRules["providers"]) {
|
||||
const { globalRules, ...otherProviders } = providers;
|
||||
|
||||
for (const [providerName, provider] of Object.entries(otherProviders)) {
|
||||
try {
|
||||
const regex = new RegExp(provider.urlPattern);
|
||||
if (regex.test(url)) {
|
||||
return provider;
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`Invalid URL pattern for provider ${providerName}: ${provider.urlPattern}`, error);
|
||||
}
|
||||
}
|
||||
|
||||
return globalRules;
|
||||
}
|
||||
|
||||
function cleanParametersByRules(url: URL, rules: string[]) {
|
||||
const params = new URLSearchParams(url.search);
|
||||
const cleanParams = new URLSearchParams();
|
||||
|
||||
for (const [key, value] of params) {
|
||||
let shouldRemove = false;
|
||||
|
||||
for (const rule of rules) {
|
||||
try {
|
||||
if (new RegExp("^" + rule + "$", "gi").test(key)) {
|
||||
shouldRemove = true;
|
||||
break;
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`Invalid rule regex: ${rule}`, error);
|
||||
}
|
||||
}
|
||||
|
||||
if (!shouldRemove) {
|
||||
cleanParams.set(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
url.search = cleanParams.toString();
|
||||
return url;
|
||||
}
|
||||
|
||||
function cleanFragmentsByRules(url: URL, rules: string[]) {
|
||||
const fragments = extractFragments(url);
|
||||
const cleanFragments = new Map<string, string | null>();
|
||||
|
||||
for (const [key, value] of fragments) {
|
||||
let shouldRemove = false;
|
||||
|
||||
for (const rule of rules) {
|
||||
try {
|
||||
if (new RegExp("^" + rule + "$", "gi").test(key)) {
|
||||
shouldRemove = true;
|
||||
break;
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`Invalid rule regex: ${rule}`, error);
|
||||
}
|
||||
}
|
||||
|
||||
if (!shouldRemove) {
|
||||
cleanFragments.set(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
url.hash = fragmentsToString(cleanFragments);
|
||||
return url;
|
||||
}
|
||||
|
||||
function extractFragments(url: URL): Map<string, string | null> {
|
||||
const fragments = new Map<string, string | null>();
|
||||
const hash = url.hash.slice(1); // Remove the #
|
||||
|
||||
if (!hash) return fragments;
|
||||
|
||||
const params = hash.split("&");
|
||||
for (const p of params) {
|
||||
const param = p.split("=");
|
||||
if (!param[0]) continue;
|
||||
|
||||
const key = param[0];
|
||||
let value: string | null = null;
|
||||
if (param.length === 2 && param[1]) {
|
||||
value = param[1];
|
||||
}
|
||||
fragments.set(key, value);
|
||||
}
|
||||
|
||||
return fragments;
|
||||
}
|
||||
|
||||
function fragmentsToString(fragments: Map<string, string | null>): string {
|
||||
const parts: string[] = [];
|
||||
for (const [key, value] of fragments) {
|
||||
if (value !== null) {
|
||||
parts.push(key + "=" + value);
|
||||
} else {
|
||||
parts.push(key);
|
||||
}
|
||||
}
|
||||
return parts.length > 0 ? parts.join("&") : "";
|
||||
}
|
||||
|
||||
function isException(url: string, exceptions: string[]): boolean {
|
||||
for (const exception of exceptions) {
|
||||
try {
|
||||
const regex = new RegExp(exception, "i");
|
||||
if (regex.test(url)) {
|
||||
return true;
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`Invalid exception regex: ${exception}`, error);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
function checkClearUrlsRedirections(url: string, providers: ClearURLsRules["providers"]): string | null {
|
||||
const matchingProvider = findMatchingProvider(url, providers);
|
||||
|
||||
if (!matchingProvider || !matchingProvider.redirections) {
|
||||
return null;
|
||||
}
|
||||
|
||||
for (const redirectionPattern of matchingProvider.redirections) {
|
||||
try {
|
||||
const regex = new RegExp(redirectionPattern, "i");
|
||||
const match = url.match(regex);
|
||||
if (match && match[1]) {
|
||||
// First capture group is the target URL
|
||||
return decodeURIComponent(match[1]);
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`Invalid redirection regex: ${redirectionPattern}`, error);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
46
src/index.ts
46
src/index.ts
@@ -1,18 +1,36 @@
|
||||
/**
|
||||
* Welcome to Cloudflare Workers! This is your first worker.
|
||||
*
|
||||
* - Run `npm run dev` in your terminal to start a development server
|
||||
* - Open a browser tab at http://localhost:8787/ to see your worker in action
|
||||
* - Run `npm run deploy` to publish your worker
|
||||
*
|
||||
* Bind resources to your worker in `wrangler.jsonc`. After adding bindings, a type definition for the
|
||||
* `Env` object can be regenerated with `npm run cf-typegen`.
|
||||
*
|
||||
* Learn more at https://developers.cloudflare.com/workers/
|
||||
*/
|
||||
import { cleanUrl } from "./cleaner";
|
||||
import { RulesCache } from "./rules-cache";
|
||||
|
||||
type Env = {
|
||||
RULES_CACHE: DurableObjectNamespace<RulesCache>;
|
||||
};
|
||||
|
||||
export { RulesCache };
|
||||
|
||||
export default {
|
||||
async fetch(request, env, ctx): Promise<Response> {
|
||||
return new Response('Hello World!');
|
||||
async fetch(request, env, _ctx): Promise<Response> {
|
||||
const url = new URL(request.url);
|
||||
const targetUrl = url.searchParams.get("url");
|
||||
|
||||
if (!targetUrl) {
|
||||
return new Response("Missing url parameter", { status: 400 });
|
||||
}
|
||||
|
||||
try {
|
||||
const rulesStub = env.RULES_CACHE.getByName("rules");
|
||||
const rules = await rulesStub.getRules();
|
||||
|
||||
const cleanedUrl = await cleanUrl(targetUrl, rules);
|
||||
return new Response(cleanedUrl, {
|
||||
headers: {
|
||||
"Content-Type": "text/plain",
|
||||
"Access-Control-Allow-Origin": "*",
|
||||
},
|
||||
});
|
||||
} catch (error) {
|
||||
return new Response(`Error processing URL: ${error instanceof Error ? error.message : "Unknown error"}`, {
|
||||
status: 500,
|
||||
});
|
||||
}
|
||||
},
|
||||
} satisfies ExportedHandler<Env>;
|
||||
|
77
src/rules-cache.ts
Normal file
77
src/rules-cache.ts
Normal file
@@ -0,0 +1,77 @@
|
||||
import { DurableObject } from "cloudflare:workers";
|
||||
import type { ClearURLsRules } from "./types";
|
||||
|
||||
const RULES_URL = "https://rules2.clearurls.xyz/data.minify.json";
|
||||
const HASH_URL = "https://rules2.clearurls.xyz/rules.minify.hash";
|
||||
const CACHE_DURATION_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
|
||||
|
||||
type CachedRules = {
|
||||
data: ClearURLsRules;
|
||||
hash: string;
|
||||
cachedAt: number;
|
||||
expiresAt: number;
|
||||
};
|
||||
|
||||
export class RulesCache extends DurableObject {
|
||||
async getRules() {
|
||||
try {
|
||||
const cached = await this.ctx.storage.get<CachedRules>("rules");
|
||||
|
||||
if (cached && Date.now() < cached.expiresAt) {
|
||||
return cached.data;
|
||||
}
|
||||
|
||||
console.log("Fetching fresh rules from ClearURLs");
|
||||
return await this.fetchAndCacheRules();
|
||||
} catch (error) {
|
||||
console.error("Error getting rules:", error);
|
||||
|
||||
// Try to return cached rules even if expired as fallback
|
||||
const cached = await this.ctx.storage.get<CachedRules>("rules");
|
||||
if (cached) {
|
||||
console.log("Falling back to expired cached rules");
|
||||
return cached.data;
|
||||
}
|
||||
|
||||
throw new Error("Failed to get rules and no cached fallback available");
|
||||
}
|
||||
}
|
||||
|
||||
private async fetchAndCacheRules() {
|
||||
const [rulesResponse, hashResponse] = await Promise.all([fetch(RULES_URL), fetch(HASH_URL)]);
|
||||
if (!rulesResponse.ok) {
|
||||
throw new Error(`Failed to fetch rules: ${rulesResponse.status}`);
|
||||
}
|
||||
if (!hashResponse.ok) {
|
||||
throw new Error(`Failed to fetch hash: ${hashResponse.status}`);
|
||||
}
|
||||
|
||||
const [rulesText, expectedHash] = await Promise.all([rulesResponse.text(), hashResponse.text()]);
|
||||
const actualHash = await this.calculateSHA256(rulesText);
|
||||
if (actualHash !== expectedHash.trim()) {
|
||||
throw new Error(`Hash validation failed. Expected: ${expectedHash.trim()}, Actual: ${actualHash}`);
|
||||
}
|
||||
|
||||
const rules = JSON.parse(rulesText) as ClearURLsRules;
|
||||
const now = Date.now();
|
||||
const cachedRules: CachedRules = {
|
||||
data: rules,
|
||||
hash: actualHash,
|
||||
cachedAt: now,
|
||||
expiresAt: now + CACHE_DURATION_MS,
|
||||
};
|
||||
|
||||
await this.ctx.storage.put("rules", cachedRules);
|
||||
console.log(`Cached rules with hash: ${actualHash}`);
|
||||
|
||||
return rules;
|
||||
}
|
||||
|
||||
private async calculateSHA256(text: string) {
|
||||
const encoder = new TextEncoder();
|
||||
const data = encoder.encode(text);
|
||||
const hashBuffer = await crypto.subtle.digest("SHA-256", data);
|
||||
const hashArray = Array.from(new Uint8Array(hashBuffer));
|
||||
return hashArray.map((b) => b.toString(16).padStart(2, "0")).join("");
|
||||
}
|
||||
}
|
14
src/types.ts
Normal file
14
src/types.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
export type ClearURLsProvider = {
|
||||
urlPattern: string;
|
||||
completeProvider?: boolean;
|
||||
rules?: string[];
|
||||
rawRules?: string[];
|
||||
referralMarketing?: string[];
|
||||
exceptions?: string[];
|
||||
redirections?: string[];
|
||||
forceRedirection?: boolean;
|
||||
};
|
||||
|
||||
export type ClearURLsRules = {
|
||||
providers: Record<string, ClearURLsProvider>;
|
||||
};
|
Reference in New Issue
Block a user