#!/usr/bin/env python3
"""
Reads `backlinks_raw.txt`, extracts unique domains, and writes
`disavow_candidates.txt` in Google Disavow format (one `domain:example.com`
per line).
"""
import sys
from urllib.parse import urlparse

IN = 'backlinks_raw.txt'
OUT = 'disavow_candidates.txt'

try:
    with open(IN, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print(f"{IN} not found. Place your URLs in {IN} and run this script.")
    sys.exit(1)

domains = set()
for url in lines:
    try:
        p = urlparse(url)
        net = p.netloc.lower()
        if not net:
            continue
        # remove possible credentials and ports
        if '@' in net:
            net = net.split('@')[-1]
        if ':' in net:
            net = net.split(':')[0]
        # strip www.
        if net.startswith('www.'):
            net = net[4:]
        domains.add(net)
    except Exception:
        continue

if not domains:
    print('No domains extracted.')
    sys.exit(1)

sorted_domains = sorted(domains)
with open(OUT, 'w', encoding='utf-8') as f:
    f.write('# Disavow candidate file generated by extract_domains.py\n')
    f.write('# Review before submission to Google Search Console '
            '(https://search.google.com/search-console/disavow-links)\n')
    for d in sorted_domains:
        f.write(f'domain:{d}\n')

print(f'Wrote {len(sorted_domains)} unique domains to {OUT}')
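
# ---------------------------------------------------------------------------
# Example run (a sketch: the file names match the defaults above, but the
# URLs and domains below are hypothetical and only illustrate the expected
# input/output shape):
#
#   $ cat backlinks_raw.txt
#   https://www.spammy-links.example/page1
#   http://user:pass@link-farm.example:8080/post
#
#   $ python3 extract_domains.py
#   Wrote 2 unique domains to disavow_candidates.txt
#
#   $ cat disavow_candidates.txt
#   # Disavow candidate file generated by extract_domains.py
#   # Review before submission to Google Search Console (https://search.google.com/search-console/disavow-links)
#   domain:link-farm.example
#   domain:spammy-links.example
#
# Note that input lines must include a scheme (http:// or https://); bare
# hostnames produce an empty netloc in urlparse and are skipped.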