import argparse
import urllib.request
import urllib.error
import struct
import lzma
import csv
import datetime
import concurrent.futures

# Dukascopy Base URL
BASE_URL = 'https://datafeed.dukascopy.com/datafeed'

def get_divisor(symbol):
    # get_divisor is called with the raw CLI symbol, while fetch_day receives the
    # normalized form; strip separators and upper-case so both resolve identically.
    s = symbol.upper().replace('/', '').replace('.', '').replace('-', '')
    
    # JPY check
    if 'JPY' in s:
        return 1000
    
    # Special case: high-value instruments whose raw integers carry only one decimal place (divisor 10)
    if s in ['BRENTCMDUSD', 'LIGHTCMDUSD', 'WTI', 'BRENT', 'USOIL', 'UKOIL', 'BTCUSD', 'ETHUSD', 'BCHUSD']:
        return 10

    # 1. Metals (DB: Metals)
    metals = [
        'XAGAUD', 'XAGEUR', 'XAGGBP', 'XAGUSD',
        'XAUAUD', 'XAUEUR', 'XAUGBP', 'XAUUSD'
    ]
    if s in metals:
        return 1000

    # 2. Commodities (DB: Commodities)
    commodities = [
        'BRENTCMDUSD', 'LIGHTCMDUSD', 
        'WTI', 'BRENT', 'USOIL', 'UKOIL', 'NGAS', 'COPPER' # Legacy/Generic
    ]
    if s in commodities:
        return 1000

    # 3. Indices (DB: Indices)
    indices = [
        'AUSIDXAUD', 'DEUIDXEUR', 'DOLLARIDXUSD', 'EUSIDXEUR', 'FRAIDXEUR', 
        'GBRIDXGBP', 'HKGIDXHKD', 'JPNIDXJPY', 'USA30IDXUSD', 'USA500IDXUSD', 'USATECHIDXUSD',
        'US30', 'DJI', 'NAS100', 'NDX', 'SPX500', 'SPX', 'DAX', 'GER30', 'UK100', 'FTSE', 'JP225' # Legacy
    ]
    if s in indices:
        return 1000

    # 4. Stocks (DB: Stocks)
    # Generated from DB list
    stocks = [
        'AABVUSUSD', 'AAPLUSUSD', 'ABTUSUSD', 'ADBEUSUSD', 'AMDUSUSD', 'AMGNUSUSD', 'AMTUSUSD', 'AMZNUSUSD', 
        'AVGOUSUSD', 'BAGBGBX', 'BABAUSUSD', 'BACUSUSD', 'BIDUUSUSD', 'BIIBUSUSD', 'BMYUSUSD', 'CUSUSD', 
        'CMCSAUSUSD', 'CMEUSUSD', 'COSTUSUSD', 'CSCOUSUSD', 'CSXUSUSD', 'CVSUSUSD', 'EBAYUSUSD', 'FUSUSD', 
        'FBUSUSD', 'FTNTUSUSD', 'GILDUSUSD', 'GOOGLUSUSD', 'HDUSUSD', 'IBMUSUSD', 'INTCUSUSD', 'INTUUSUSD', 
        'ISRGUSUSD', 'JNJUSUSD', 'JPMUSUSD', 'KOUSUSD', 'LIFREUR', 'LLYUSUSD', 'LMTUSUSD', 'MAUSUSD', 
        'MCDUSUSD', 'MDLZUSUSD', 'MMMUSUSD', 'MOUSUSD', 'MRKDEEUR', 'MSITEUR', 'MSFTUSUSD', 'NFLXUSUSD', 
        'NIOUSUSD', 'NKEUSUSD', 'NVDAUSUSD', 'ORCLUSUSD', 'PEPUSUSD', 'PFEUSUSD', 'PGUSUSD', 'PMUSUSD', 
        'PYPLUSUSD', 'REGNUSUSD', 'SBUXUSUSD', 'TUSUSD', 'TMOUSUSD', 'TMUSUSUSD', 'TSLAUSUSD', 'TSMUSUSD', 
        'UNHUSUSD', 'UPSUSUSD', 'VUSUSD', 'VRTXUSUSD', 'VZUSUSD', 'WFCUSUSD', 'WMTUSUSD', 'XOMUSUSD', 
        'XPEVUSUSD', 'YUMCUSUSD'
    ]
    if s in stocks:
        return 1000
        
    # Crypto prefix check (BTCUSD/ETHUSD/BCHUSD already returned 10 above)
    if any(s.startswith(x) for x in ['BTC', 'ETH', 'LTC', 'BCH', 'XRP', 'SOL', 'ADA', 'DOGE', 'BNB', 'DOT', 'MATIC', 'LINK']):
        return 1000
        
    return 100000
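
# For illustration, with the lists above: get_divisor('EURUSD') falls through to the
# default 100000 (5-decimal prices), get_divisor('USDJPY') returns 1000 (3 decimals),
# and get_divisor('BTCUSD') hits the special case and returns 10 (1 decimal).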

def normalize_symbol(symbol):
    return symbol.upper().replace('/', '').replace('.', '').replace('-', '')
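
# e.g. normalize_symbol('eur/usd') -> 'EURUSD'; fetch_day expects this normalized form.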

def parse_args():
    parser = argparse.ArgumentParser(description='Fetch Dukascopy Historical Data')
    parser.add_argument('--symbol', required=True, help='Asset Symbol (e.g. EURUSD)')
    parser.add_argument('--asset-id', required=True, type=int, help='Asset ID in Database')
    parser.add_argument('--start-year', required=True, type=int, help='Start Year')
    parser.add_argument('--end-year', required=True, type=int, help='End Year')
    parser.add_argument('--output', required=True, help='Output CSV File')
    return parser.parse_args()
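
# Example invocation (the script filename is illustrative):
#   python fetch_dukascopy.py --symbol EURUSD --asset-id 1 \
#       --start-year 2020 --end-year 2021 --output eurusd_m1.csv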

def fetch_day(symbol, date_obj, divisor):
    year = date_obj.strftime('%Y')
    month = str(int(date_obj.strftime('%m')) - 1).zfill(2)  # Dukascopy uses 0-indexed months (January = '00')
    day = date_obj.strftime('%d')
    
    url = f"{BASE_URL}/{symbol}/{year}/{month}/{day}/BID_candles_min_1.bi5"
    
    try:
        req = urllib.request.Request(url)
        with urllib.request.urlopen(req, timeout=10) as response:
            content = response.read()
            if not content:
                return []
            
            # Decompress (.bi5 files are LZMA-compressed)
            try:
                binary = lzma.decompress(content)
            except lzma.LZMAError:
                # Not valid LZMA data (e.g. a truncated or empty payload); skip this day.
                return []
                
            length = len(binary)
            # Each 24-byte row mirrors PHP's unpack('Ntime/Nopen/Nclose/Nlow/Nhigh/Gvol', ...):
            # five big-endian 32-bit unsigned ints (time offset within the day, open,
            # close, low, high) followed by one big-endian 32-bit float (volume),
            # which is '>IIIIIf' for struct.unpack.

            rows = []
            day_timestamp = int(date_obj.replace(tzinfo=datetime.timezone.utc).timestamp())

            struct_fmt = '>IIIIIf'
            row_size = 24
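            # For illustration (hypothetical values): a decoded chunk such as
            # (3600, 107005, 107012, 107001, 107015, 85.3) would be the 01:00 UTC candle
            # with open 1.07005 after dividing by a divisor of 100000.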
            
            for i in range(0, length, row_size):
                chunk = binary[i:i+row_size]
                if len(chunk) < row_size:
                    break
                
                data = struct.unpack(struct_fmt, chunk)
                # data = (time_offset, open, close, low, high, vol)
                
                time_offset = data[0]
                price_open = data[1] / divisor
                price_close = data[2] / divisor
                price_low = data[3] / divisor
                price_high = data[4] / divisor
                volume = data[5]
                
                # Skip candles with effectively zero volume (minutes with no activity).
                if volume <= 0.0000001:
                    continue
                
                timestamp = day_timestamp + time_offset
                created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                
                # 'period' is 'm1'
                rows.append((
                    'm1',
                    datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
                    f"{price_open:.5f}",
                    f"{price_high:.5f}",
                    f"{price_low:.5f}",
                    f"{price_close:.5f}",
                    f"{volume:.2f}",
                    created_at,
                    created_at
                ))
            return rows
            
    except urllib.error.HTTPError:
        # 404 is normal for weekends/holidays or days with no data.
        return []
    except Exception as e:
        # Network errors, timeouts, malformed data, etc.: treat the day as unavailable.
        # Uncomment for debugging: print(f"Error fetching {url}: {e}")
        return []

def main():
    args = parse_args()
    
    symbol_norm = normalize_symbol(args.symbol)
    divisor = get_divisor(args.symbol)
    
    start_date = datetime.datetime(args.start_year, 1, 1)
    end_date = datetime.datetime(args.end_year, 12, 31)
    
    current_date = start_date
    dates = []
    while current_date <= end_date:
        # Check every calendar day: weekend/holiday files simply 404 or decode to
        # nothing, and the thread pool below keeps the extra requests cheap
        # (crypto symbols trade on weekends anyway).
        dates.append(current_date)
        current_date += datetime.timedelta(days=1)
    
    print(f"Fetching {len(dates)} days for {args.symbol}...")
    
    # Truncate/create the output file; batches of rows are appended to it below.
    with open(args.output, 'w', newline=''):
        pass
        
    # Process in chunks to append to CSV
    chunk_size = 100
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        for i in range(0, len(dates), chunk_size):
            chunk_dates = dates[i:i+chunk_size]
            future_to_date = {executor.submit(fetch_day, symbol_norm, date, divisor): date for date in chunk_dates}
            
            chunk_rows = []
            for future in concurrent.futures.as_completed(future_to_date):
                res = future.result()
                if res:
                    chunk_rows.extend(res)
            
            if chunk_rows:
                # Sort the batch by timestamp; insertion order does not matter for the
                # database load, but a sorted CSV is easier to inspect.
                chunk_rows.sort(key=lambda x: x[1])
                
                with open(args.output, 'a', newline='') as f:
                    # csv.writer uses ',' as the delimiter and handles any escaping,
                    # producing output compatible with LOAD DATA LOCAL INFILE.
                    # Columns: asset_id, period, timestamp, open, high, low, close,
                    # volume, created_at, updated_at. fetch_day returns rows without
                    # asset_id, so prepend it here.
                    asset_id = str(args.asset_id)
                    row_fmt = [[asset_id] + list(r) for r in chunk_rows]

                    writer = csv.writer(f)
                    writer.writerows(row_fmt)
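                    # Example output line (hypothetical values):
                    # 42,m1,2023-01-02 00:01:00,1.07005,1.07012,1.07001,1.07010,85.30,2024-01-01 12:00:00,2024-01-01 12:00:00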
            
            print(f"Processed batch {i} to {min(i+chunk_size, len(dates))}")

if __name__ == "__main__":
    main()
