import requests
from bs4 import BeautifulSoup
import pdfplumber
import os
import json
import re
import tempfile
from datetime import datetime
from django.conf import settings
from django.core.files.base import ContentFile
from django.db import transaction
from .models import BlacklistRecord, ScrapeLog
import nepali_datetime
from concurrent.futures import ThreadPoolExecutor

def _cell_text(cell):
    """Return a table cell as stripped text, mapping an empty (None) cell to ""."""
    # str(None) would yield the literal text "None", which must never be
    # stored as a blacklist number, date or name.
    return "" if cell is None else str(cell).strip()


def _parse_table_row(row, page_idx):
    """Turn one extracted table row into a record dict.

    Returns None for rows that are not data rows: fewer than five columns,
    a non-numeric serial number, or an empty/header blacklist-number cell.
    """
    if not row or len(row) < 5:
        return None

    sno_str = _cell_text(row[0])
    # isdecimal() only accepts characters that int() can parse; isdigit()
    # also accepts e.g. superscript digits, on which int() raises ValueError.
    if not sno_str.isdecimal():
        return None

    bl_no = _cell_text(row[1])
    if not bl_no or bl_no.lower() == 'blacklist no.':
        return None

    return {
        'idx': page_idx,
        'sno': int(sno_str),
        'bl_no': bl_no,
        'bl_date_bs': _cell_text(row[2]),
        'borrower_name': _cell_text(row[3]),
        'associated_raw': _cell_text(row[4]),
    }


def _process_batch(args):
    """Extract blacklist rows from a batch of pages of a PDF file.

    Args:
        args: A ``(page_range, tmp_pdf_path)`` pair, where ``page_range`` is a
            sequence of zero-based page indices and ``tmp_pdf_path`` is the
            path of the PDF to open.

    Returns:
        A list of dicts with the keys ``idx`` (page index), ``sno``,
        ``bl_no``, ``bl_date_bs``, ``borrower_name`` and ``associated_raw``.
        Errors are printed rather than raised; whatever was extracted before
        the error is still returned.
    """
    page_range, tmp_pdf_path = args
    if not page_range:
        # Nothing to do; also avoids indexing an empty range in the message below.
        return []

    print(f"  [Worker] Starting batch {page_range[0]} - {page_range[-1]}...", flush=True)
    results = []
    try:
        # Imported inside the try so that an import failure is reported like
        # any other batch error instead of propagating to the caller.
        import pdfplumber
        with pdfplumber.open(tmp_pdf_path) as inner_pdf:
            page_count = len(inner_pdf.pages)
            for page_idx in page_range:
                if page_idx >= page_count:
                    break
                table = inner_pdf.pages[page_idx].extract_table()
                if not table:
                    continue
                for row in table:
                    record = _parse_table_row(row, page_idx)
                    if record is not None:
                        results.append(record)
    except Exception as e:
        print(f"Error in batch {page_range}: {e}")
    return results

def _sample_blacklist_no(row):
    """Return the blacklist number of a sampled table row, or None.

    Mirrors the row filter used during full extraction (at least five
    columns, numeric serial number, non-empty blacklist number that is not
    the column header) so that title/header rows and empty cells are not
    mistaken for unseen records.
    """
    if not row or len(row) < 5:
        return None
    sno = "" if row[0] is None else str(row[0]).strip()
    if not sno.isdigit():
        return None
    # An empty cell may come back as None; str(None) would read as "None".
    bl_no = "" if row[1] is None else str(row[1]).strip()
    if not bl_no or bl_no.lower() == 'blacklist no.':
        return None
    return bl_no


def _pdf_has_new_records(pdf, total_pages):
    """Sample the first, last and middle pages for unseen blacklist numbers.

    Only the first ten table rows of each sampled page are inspected.
    Returns True as soon as one data row carries a blacklist number that has
    no matching BlacklistRecord, otherwise False.
    """
    # dict.fromkeys drops duplicate indices (short PDFs) while keeping order.
    for p_idx in dict.fromkeys((0, total_pages - 1, total_pages // 2)):
        if not 0 <= p_idx < total_pages:
            continue
        table = pdf.pages[p_idx].extract_table()
        if not table:
            continue
        for row in table[:10]:
            bl_no = _sample_blacklist_no(row)
            if bl_no and not BlacklistRecord.objects.filter(blacklist_no=bl_no).exists():
                return True
    return False


def _bs_date_to_ad(bs_text):
    """Convert a ``Y-M-D`` or ``Y/M/D`` Nepali-calendar date string.

    Returns the result of ``nepali_datetime.date(...).to_datetime_date()``,
    or None when the text does not have three parts or cannot be converted.
    """
    parts = re.split(r'[-/]', bs_text)
    if len(parts) != 3:
        return None
    try:
        year, month, day = (int(part) for part in parts)
        return nepali_datetime.date(year, month, day).to_datetime_date()
    except Exception:
        # Best effort: an unparseable date must not abort the import. Unlike
        # a bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        return None


def _extract_rows_in_parallel(tmp_pdf_path, total_pages, batch_size=20):
    """Run _process_batch over the whole PDF in page batches on a thread pool.

    Returns all extracted row dicts ordered by page index.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    batches = [
        (range(start, min(start + batch_size, total_pages)), tmp_pdf_path)
        for start in range(0, total_pages, batch_size)
    ]
    num_batches = len(batches)

    print(f"Starting parallel extraction in {num_batches} batches...")
    all_extracted_data = []

    with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as executor:
        futures = {executor.submit(_process_batch, b): i for i, b in enumerate(batches)}

        completed_batches = 0
        for future in as_completed(futures):
            batch_idx = futures[future]
            try:
                all_extracted_data.extend(future.result())
                completed_batches += 1
                # Batches finish out of order, so this is only an estimate.
                processed_pages = min(completed_batches * batch_size, total_pages)
                if completed_batches % 5 == 0 or processed_pages == total_pages:
                    print(f"Progress: ~{processed_pages}/{total_pages} pages processed ({(processed_pages/total_pages*100):.1f}%)", flush=True)
            except Exception as e:
                print(f"Error in batch {batch_idx}: {e}")

    # Restore page order; the sort is stable, so rows keep their order within a page.
    all_extracted_data.sort(key=lambda x: x['idx'])
    return all_extracted_data


def _save_new_records(all_extracted_data, file_name, pdf_url, pdf_content):
    """Store rows whose blacklist number is not yet known, linked to a new ScrapeLog.

    The log entry, the attached PDF and the records are written inside one
    transaction. Returns the number of records created.
    """
    print("Filtering duplicates and preparing database entries...", flush=True)
    with transaction.atomic():
        # Create the log first so the records can reference it.
        log = ScrapeLog.objects.create(
            status='Success',
            file_name=file_name,
            file_url=pdf_url,
            message="Processing records..."
        )
        log.file_path.save(file_name, ContentFile(pdf_content), save=False)

        # Numbers already stored; numbers seen earlier in this PDF are added
        # as we go so duplicates inside the same file are skipped too.
        known_nos = set(BlacklistRecord.objects.values_list('blacklist_no', flat=True))

        to_create = []
        for item in all_extracted_data:
            bl_no = item['bl_no']
            if bl_no in known_nos:
                continue
            known_nos.add(bl_no)

            associated_names = [name.strip() for name in item['associated_raw'].split(',') if name.strip()]
            associated_list = [{"name": n, "type": "Person"} for n in associated_names]

            to_create.append(BlacklistRecord(
                blacklist_no=bl_no,
                sno=item['sno'],
                blacklist_date_bs=item['bl_date_bs'],
                blacklist_date_ad=_bs_date_to_ad(item['bl_date_bs']),
                borrower_name=item['borrower_name'],
                associated_entities=json.dumps(associated_list),
                scrape_log=log
            ))

        records_count = len(to_create)
        if to_create:
            print(f"Saving {records_count} new records to database...", flush=True)
            BlacklistRecord.objects.bulk_create(to_create, batch_size=1000)
        else:
            print("No new unique records to save.", flush=True)

        log.records_extracted = records_count
        log.message = f"Successfully extracted {records_count} new records"
        log.save()
        print(f"Scraping session finished. Status: Success, New Records: {records_count}", flush=True)

    return records_count


def scrape_blacklist():
    """Download the blacklist PDF from the CIB website and import new records.

    Steps: locate the first link whose href contains both "blacklist" and
    ".pdf", download it, sample a few pages to decide whether anything new is
    present, and if so extract every page and bulk-create the records that
    are not stored yet.

    Returns:
        True when the run completed (including the "nothing new, skipped"
        case), False otherwise. When the link is missing or the download does
        not return HTTP 200, False is returned without writing a ScrapeLog;
        any exception is recorded in a ScrapeLog with status 'Failed'.
    """
    pdf_url = ""
    file_name = ""

    try:
        from urllib.parse import urljoin

        base_url = "https://cibnepal.org.np"
        response = requests.get(base_url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the "Blacklist Downloads" link
        link_tag = next(
            (
                a for a in soup.find_all('a', href=True)
                if 'blacklist' in a['href'].lower() and '.pdf' in a['href'].lower()
            ),
            None,
        )
        if link_tag is None:
            print("Error: Could not find Blacklist Downloads link on CIB website.", flush=True)
            return False

        # urljoin leaves absolute URLs untouched and resolves relative,
        # root-relative and protocol-relative hrefs against the site root.
        pdf_url = urljoin(base_url + "/", link_tag['href'])

        print(f"Found Blacklist PDF URL: {pdf_url}", flush=True)
        file_name = pdf_url.split('/')[-1]

        print("Downloading PDF...", flush=True)
        pdf_response = requests.get(pdf_url, timeout=60)
        if pdf_response.status_code != 200:
            print(f"Error: Failed to download PDF (Status Code: {pdf_response.status_code})", flush=True)
            return False

        pdf_content = pdf_response.content
        print(f"PDF downloaded successfully. Size: {len(pdf_content) / 1024 / 1024:.2f} MB", flush=True)

        print("Saving to temporary file for processing...", flush=True)
        # delete=False because the file is reopened by path below; the
        # finally clause removes it, even if writing the content fails.
        tmp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        tmp_pdf_path = tmp_pdf.name
        try:
            with tmp_pdf:
                tmp_pdf.write(pdf_content)

            with pdfplumber.open(tmp_pdf_path) as pdf:
                total_pages = len(pdf.pages)
                print(f"Total pages in PDF: {total_pages}", flush=True)
                if total_pages == 0:
                    # Fail with a clear message instead of an IndexError
                    # from sampling a page that does not exist.
                    raise ValueError("Downloaded PDF contains no pages.")

                # --- SMART CHECK ---
                print("Performing smart check to detect new data...", flush=True)
                new_found = _pdf_has_new_records(pdf, total_pages)

            if not new_found and BlacklistRecord.objects.exists():
                print("Smart check result: No new records found. Skipping full scan.", flush=True)
                with transaction.atomic():
                    log = ScrapeLog.objects.create(
                        status='Success',
                        file_name=file_name,
                        file_url=pdf_url,
                        records_extracted=0,
                        message="No new records detected. Processing skipped."
                    )
                    log.file_path.save(file_name, ContentFile(pdf_content), save=True)
                return True

            print("Smart check result: New records detected. Starting full extraction...", flush=True)

            # --- PARALLEL PROCESSING ---
            all_extracted_data = _extract_rows_in_parallel(tmp_pdf_path, total_pages)
            print(f"Extraction complete. Total potential records extracted: {len(all_extracted_data)}", flush=True)

            # --- BULK CREATE ---
            _save_new_records(all_extracted_data, file_name, pdf_url, pdf_content)
            return True
        finally:
            if os.path.exists(tmp_pdf_path):
                os.remove(tmp_pdf_path)

    except Exception as e:
        import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        ScrapeLog.objects.create(
            status='Failed',
            file_name=file_name,
            file_url=pdf_url,
            message=error_msg[:1000] # Truncate message
        )
        return False
