#!/usr/bin/env python3
"""
Reads `backlinks_raw.txt`, extracts unique domains, and writes
`disavow_candidates.txt` in Google Disavow format (one `domain:example.com`
per line).
"""
import sys
from urllib.parse import urlparse

IN = 'backlinks_raw.txt'
OUT = 'disavow_candidates.txt'

try:
    with open(IN, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print(f"{IN} not found. Place your URLs in {IN} and run this script.")
    sys.exit(1)

domains = set()
for url in lines:
    try:
        p = urlparse(url)
        net = p.netloc.lower()
        if not net:
            continue
        # remove possible credentials and ports
        if '@' in net:
            net = net.split('@')[-1]
        if ':' in net:
            net = net.split(':')[0]
        # strip www.
        if net.startswith('www.'):
            net = net[4:]
        domains.add(net)
    except Exception:
        continue

if not domains:
    print('No domains extracted.')
    sys.exit(1)

sorted_domains = sorted(domains)
with open(OUT, 'w', encoding='utf-8') as f:
    f.write('# Disavow candidate file generated by extract_domains.py\n')
    f.write('# Review before submission to Google Search Console '
            '(https://search.google.com/search-console/disavow-links)\n')
    for d in sorted_domains:
        f.write(f'domain:{d}\n')

print(f'Wrote {len(sorted_domains)} unique domains to {OUT}')
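
# ---------------------------------------------------------------------------
# Example run (a sketch: the file names match the defaults above, but the
# URLs and domains below are hypothetical and only illustrate the expected
# input/output shape):
#
#   $ cat backlinks_raw.txt
#   https://www.spammy-links.example/page1
#   http://user:pass@link-farm.example:8080/post
#
#   $ python3 extract_domains.py
#   Wrote 2 unique domains to disavow_candidates.txt
#
#   $ cat disavow_candidates.txt
#   # Disavow candidate file generated by extract_domains.py
#   # Review before submission to Google Search Console (https://search.google.com/search-console/disavow-links)
#   domain:link-farm.example
#   domain:spammy-links.example
#
# Note that input lines must include a scheme (http:// or https://); bare
# hostnames produce an empty netloc in urlparse and are skipped.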