import json
import time
import datetime
import logging
import argparse
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from stem import Signal
from stem.control import Controller
from currency_converter import CurrencyConverter

# Maps the currency symbol that prefixes a price string to the three-letter
# currency code handed to the converter in to_price().
currencies = {'$': 'USD', '€': 'EUR', '£': 'GBP', '₹': 'INR'}
# Logger shared by every function in this module; configured in parse_args().
logger = logging.getLogger("gsmarena-scraper")
# Converter used by to_price() to express scraped prices in USD.
currency_conv = CurrencyConverter()

class tor_network:
    """Fetch web pages through a local Tor SOCKS proxy.

    The instance owns a ``requests`` session whose traffic is routed through
    the proxy and can ask the Tor control port for a new circuit when the
    remote site answers with a "too many requests" page.

    Args:
        proxy_url: SOCKS proxy URL used for both http and https traffic.
        control_port: Tor control port used to request a new identity.
        control_password: Password sent to the Tor control port. The default
            is the placeholder the script has always shipped with.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_tries: Maximum number of requests ``get_soup`` issues for one URL.
    """

    def __init__(
        self,
        proxy_url="socks5h://localhost:9050",
        control_port=9051,
        control_password="my password",
        timeout=60,
        max_tries=30,
    ):
        self.proxy_url = proxy_url
        self.control_port = control_port
        self.control_password = control_password
        self.timeout = timeout
        self.max_tries = max_tries
        self.session = self._new_session()
        self.ntries = 0

    def _new_session(self):
        """Return a fresh requests session routed through the proxy."""
        session = requests.session()
        session.proxies = {
            "http": self.proxy_url,
            "https": self.proxy_url,
        }
        return session

    def get_soup(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup tree.

        A page titled "too many requests" triggers a new Tor identity and a
        retry, at most ``max_tries`` requests in total.

        Returns:
            The parsed page, or None when the download or parsing fails, the
            page has no <title>, or every attempt was rate limited.
        """
        # A local counter bounds the loop: request_new_ip() resets
        # self.ntries, so that attribute cannot be used to stop retrying.
        for attempt in range(1, self.max_tries + 1):
            # Wait before every request, including retries.
            time.sleep(1)
            self.ntries += 1
            try:
                # requests waits indefinitely unless a timeout is passed.
                response = self.session.get(url, timeout=self.timeout)
                soup = BeautifulSoup(response.content, features="lxml")
                title = soup.find("title")
                if title is None:
                    logger.warning(
                        f"Can't extract webpage {url}: page has no <title>"
                    )
                    return None
                if title.text.lower() != "too many requests":
                    self.ntries = 0
                    return soup
                logger.info("Too many requests.")
                self.request_new_ip()
            except Exception as e:
                logger.warning(f"Can't extract webpage {url}: {e}")
                return None
            logger.debug(f"Try {attempt} : Problem with soup for {url}.")

        self.ntries = 0
        logger.warning(
            f"Can't extract webpage {url}: still rate limited after "
            f"{self.max_tries} tries"
        )
        return None

    def request_new_ip(self):
        """Ask Tor for a new identity and start over with a fresh session."""
        logger.info("Requesting new ip address.")
        with Controller.from_port(port=self.control_port) as controller:
            controller.authenticate(password=self.control_password)
            controller.signal(Signal.NEWNYM)
        # Release the old session's pooled connections before replacing it.
        self.session.close()
        self.session = self._new_session()
        self.ntries = 0


def extract_smartphone_infos(network, smartphone):
    """Scrape one device page and return its raw specifications.

    Args:
        network: Object exposing ``get_soup(url)`` and ``request_new_ip()``.
        smartphone: Parsed list item containing an ``<a href=...>`` link to
            the device page, relative to https://www.gsmarena.com/.

    Returns:
        A dict with "device_name", optionally "spec_versions" and
        "release_date", and one "<table heading>_<row title>" entry per
        specification row. None when the page cannot be fetched or carries
        no device name.
    """
    link = smartphone.find("a")
    url_smartphone = f"https://www.gsmarena.com/{requests.utils.quote(str(link['href']))}"

    logger.debug("url_smartphone : %s", url_smartphone)
    soup_smartphone = network.get_soup(url_smartphone)
    if soup_smartphone is None:
        return None

    # The device name is the text directly inside <h1>. Without it the page
    # is unusable, so skip it rather than crash on a missing element.
    heading = soup_smartphone.find("h1")
    name_parts = heading.find_all(text=True, recursive=False) if heading is not None else []
    if not name_parts:
        logger.warning("%s : device name not found", url_smartphone)
        soup_smartphone.decompose()
        return None
    smartphone_dict = {"device_name": str(name_parts[0])}
    logger.info(f"Processing device {smartphone_dict['device_name']}")

    # parse SPEC_VERSIONS: an inline script describing the device variants.
    for script in soup_smartphone.find_all("script"):
        script_text = (script.string or "").strip()
        if script_text.startswith("var SPEC_VERSIONS"):
            spec_versions_str = script_text.replace("var SPEC_VERSIONS = [[\n", '')
            spec_versions_str = spec_versions_str.replace("]];", '')
            smartphone_dict["spec_versions"] = to_spec_versions(spec_versions_str)

    release = soup_smartphone.find("span", {"data-spec": "released-hl"})
    if release:
        smartphone_dict["release_date"] = release.text.strip()

    spec_tables = soup_smartphone.find_all("table")
    if spec_tables:
        for spec_table in spec_tables:
            category = spec_table.find("th")
            if category is None:
                continue
            spec_category = category.text.strip()
            for spec_row in spec_table.find_all("tr"):
                title_cell = spec_row.find("td", {"class": "ttl"})
                value_cell = spec_row.find("td", {"class": "nfo"})
                # Skip rows lacking a linked title or a value cell; a missing
                # value cell used to raise AttributeError.
                if title_cell is None or title_cell.a is None or value_cell is None:
                    continue
                spec_key = title_cell.a.text.strip()
                spec_value = value_cell.get_text(separator="; ").strip()
                smartphone_dict[spec_category + "_" + spec_key] = spec_value
    else:
        logger.error("%s : table not found", smartphone_dict["device_name"])
        network.request_new_ip()

    soup_smartphone.decompose()
    return smartphone_dict


def extract_device_metadata(smartphone_dict):
    """Turn one scraped device dict into a list of metadata rows.

    One row is produced for the device name without its leading brand word,
    plus one row per additional model identifier found in "spec_versions"
    and "Misc_Models". Rows are de-duplicated on the lower-cased
    "device_model".

    Args:
        smartphone_dict: Dict produced by extract_smartphone_infos(); must
            contain "device_name".

    Returns:
        A list of dicts with the keys device_model, device_name,
        release_date, chipset, cpu, gpu, analog_mic_jack, price_usd and
        display_resolution.
    """
    device_metadata_dicts = []
    visited_models = set()

    release_date = None
    release_text = smartphone_dict.get("release_date", "")
    if release_text.startswith("Released"):
        release_date = to_date(release_text.removeprefix("Released "))

    price = None
    if "Misc_Price" in smartphone_dict:
        price = to_price(smartphone_dict["Misc_Price"])

    # set device_name as device_model but remove the brand name at begin.
    device_name = smartphone_dict["device_name"]
    brand_name, separator, name_rest = device_name.partition(" ")
    has_model_part = bool(separator)
    model = name_rest if has_model_part else device_name

    jack = smartphone_dict.get("Sound_3.5mm jack")
    device_metadata_dict = {
        "device_model": model,
        "device_name": device_name,
        "release_date": release_date,
        "chipset": smartphone_dict.get("Platform_Chipset"),
        "cpu": smartphone_dict.get("Platform_CPU"),
        "gpu": smartphone_dict.get("Platform_GPU"),
        "analog_mic_jack": to_analog_mic_jack(jack) if jack is not None else None,
        "price_usd": round(price, 2) if price is not None else None,
        "display_resolution": smartphone_dict.get("Display_Resolution"),
    }

    maybe_add_to_device_metadata_dicts(device_metadata_dicts, device_metadata_dict, visited_models)

    # deal with motorola device_model is like: `moto g(7) play`.
    # has_model_part guards the one-word name "Motorola", which used to raise
    # IndexError here.
    if brand_name == "Motorola" and has_model_part:
        moto_name_splits = name_rest.lower().split()
        if len(moto_name_splits) > 1 and len(moto_name_splits[1]) > 1:
            second_word = moto_name_splits[1]
            if '0' <= second_word[1] <= '9':
                moto_name_splits[1] = second_word[0] + '(' + second_word[1:] + ')'
                device_metadata_dict["device_model"] = ' '.join(moto_name_splits)
                # Go through the shared helper so later duplicates of this
                # variant are filtered like every other model.
                maybe_add_to_device_metadata_dicts(device_metadata_dicts, device_metadata_dict, visited_models)

    # check spec_versions.
    for version, spec in smartphone_dict.get("spec_versions", {}).items():
        device_metadata_dict_copy = device_metadata_dict.copy()
        parse_version_spec(device_metadata_dict_copy, version, spec)
        if brand_name == "Samsung":
            device_metadata_dict_copy["device_model"] = "sm-" + device_metadata_dict_copy["device_model"]
        maybe_add_to_device_metadata_dicts(device_metadata_dicts, device_metadata_dict_copy, visited_models)
        if "modelname" in spec:
            # Also register the variant's model name with the brand removed.
            device_metadata_dict_copy["device_model"] = spec["modelname"].lower().replace(brand_name.lower(), '').strip()
            maybe_add_to_device_metadata_dicts(device_metadata_dicts, device_metadata_dict_copy, visited_models)

    # check misc_models.
    if "Misc_Models" in smartphone_dict:
        for misc_model in smartphone_dict["Misc_Models"].split(", "):
            device_metadata_dict["device_model"] = misc_model
            maybe_add_to_device_metadata_dicts(device_metadata_dicts, device_metadata_dict, visited_models)

    return device_metadata_dicts


def to_analog_mic_jack(analog_mic_jack_str):
    """Map a yes/no style answer to "TRUE" or "FALSE".

    The comparison is case-insensitive and exact; any other text yields None.
    """
    verdicts = {
        **dict.fromkeys(("yes", "true", "y", "t"), "TRUE"),
        **dict.fromkeys(("no", "false", "n", "f"), "FALSE"),
    }
    return verdicts.get(analog_mic_jack_str.lower())


def to_price(price_str):
    """Convert a scraped price string to an amount in USD.

    Two shapes are understood: "About <amount> <currency code>" and a
    "/"-separated list of amounts each prefixed by a symbol listed in
    ``currencies``. For a list, the first entry that converts wins.

    Returns:
        The converted amount, or None when no part of the string can be
        parsed or converted.
    """
    candidates = []
    if price_str.startswith("About"):
        price_info = price_str.split(" ")
        if len(price_info) == 3:
            candidates.append((price_info[1], price_info[2]))
    else:
        for price_info in price_str.split("/"):
            price_info = price_info.strip().replace('\u2009', '')
            # An empty segment (e.g. a trailing "/") used to raise IndexError.
            if price_info and price_info[0] in currencies:
                candidates.append((price_info[1:], currencies[price_info[0]]))

    for amount, currency in candidates:
        try:
            return currency_conv.convert(float(amount.replace(",", "")), currency, 'USD')
        except ValueError as e:
            # float() raises ValueError on a malformed amount.
            # NOTE(review): the converter is expected to raise ValueError for
            # an unsupported currency as well - confirm against its docs.
            logger.warning("can't convert price %r: %s", price_str, e)
    return None


def to_date(date_str):
    """Parse a release date string into a datetime.

    The number of space-separated parts selects the format: "<year>",
    "<year>, <month name>", "<year>, Q1".."Q4" (mapped to the first month of
    the quarter) or "<year>, <month name> <day>". A string that cannot be
    parsed is logged and yields None.
    """
    quarter_first_month = {
        "Q1": "January",
        "Q2": "April",
        "Q3": "July",
        "Q4": "October",
    }
    parts = date_str.split(" ")
    part_count = len(parts)
    try:
        if part_count == 1:
            return datetime.datetime.strptime(date_str, '%Y')
        if part_count == 2:
            year_part, tail = parts
            if tail in quarter_first_month:
                return datetime.datetime.strptime(
                    f"{year_part} {quarter_first_month[tail]}", '%Y, %B'
                )
            return datetime.datetime.strptime(date_str, '%Y, %B')
        if part_count == 3:
            return datetime.datetime.strptime(date_str, '%Y, %B %d')
    except Exception as e:
        logger.warning("can't convert release date: %s", e)
    return None


def to_spec_versions(spec_versions_str):
    """Parse the body of the page's SPEC_VERSIONS script into a dict.

    The text is split on "]," into entries; each entry is split once on '",'
    into a version label and a JSON object. A trailing comma before the
    closing brace is removed because JSON does not allow it.

    Returns:
        A dict mapping each version label to its decoded JSON value. Entries
        without a label/spec pair, or whose spec is not valid JSON, are
        skipped; the latter are logged.
    """
    # Same logger as the rest of the module, looked up by name.
    log = logging.getLogger("gsmarena-scraper")
    spec_versions = {}
    for spec_version in spec_versions_str.split("],"):
        spec_version_info = spec_version.strip().split('",', 1)
        if len(spec_version_info) != 2:
            continue
        version = spec_version_info[0].strip().replace('["', '')
        spec_str = spec_version_info[1].strip().replace(',\n}', "\n}")
        try:
            spec = json.loads(spec_str)
        except json.JSONDecodeError as e:
            # One malformed entry must not discard the other variants.
            log.warning("can't parse spec version %s: %s", version, e)
            continue
        spec_versions[version] = spec
    return spec_versions


def parse_version_spec(device_metadata_dict, version, spec):
    """Overlay one variant's data onto ``device_metadata_dict`` in place.

    The model becomes the lower-cased ``version``; chipset, cpu and gpu are
    taken from ``spec`` when it provides them and otherwise keep the value
    already stored in the dict.
    """
    device_metadata_dict["device_model"] = version.lower()
    for field in ("chipset", "cpu", "gpu"):
        device_metadata_dict[field] = (
            spec[field] if field in spec else device_metadata_dict[field]
        )


def maybe_add_to_device_metadata_dicts(device_metadata_dicts, device_metadata_dict, visited_models):
    """Append a copy of the row unless its model was already recorded.

    Models are compared lower-cased; ``visited_models`` is updated with every
    model that gets appended.
    """
    model_key = device_metadata_dict["device_model"].lower()
    if model_key in visited_models:
        return
    # Store a copy so the caller can keep mutating its working dict.
    device_metadata_dicts.append(device_metadata_dict.copy())
    visited_models.add(model_key)


def extract_brand_name(brand):
    """Return the brand link's href up to, but excluding, its last "-".

    An href without any "-" is returned unchanged.
    """
    href = brand["href"]
    name_part, *_ = href.rsplit("-", 1)
    return name_part


def extract_brand_infos(network, brand):
    """Return all device metadata of the given brand.

    Listing pages are fetched one after another, starting at page 1, until a
    page cannot be fetched or lists no devices.

    Args:
        network: Object exposing ``get_soup(url)``.
        brand: Brand link whose ``href`` looks like "<name>-<id>.php".

    Returns:
        A list with the metadata rows of every device that could be scraped.
    """
    href_parts = brand["href"].rsplit("-", 1)
    brand_name = str(href_parts[0])
    brand_id = str(href_parts[1].split(".")[0])
    logger.info(f"Processing brand {brand_name}")
    url_brand_base = f"https://www.gsmarena.com/{brand_name}-f-{brand_id}-0"
    device_metadata_list = []

    index_page = 1
    while True:
        url_brand_page = f"{url_brand_base}-p{index_page}.php"
        index_page += 1
        logger.debug(f"Page URL : {url_brand_page}")
        soup_page = network.get_soup(url_brand_page)
        if soup_page is None:
            return device_metadata_list

        section = soup_page.find("div", {"class": "section-body"})
        smartphones = section.find_all("li") if section is not None else []
        if not smartphones:
            soup_page.decompose()
            logger.warning(
                "%s : div class=section-body not found", url_brand_page
            )
            return device_metadata_list

        # decompose() destroys every tag of the page, including the <li>
        # items, so the page is only released once they have been processed.
        try:
            for smartphone in smartphones:
                smartphone_dict = extract_smartphone_infos(network, smartphone)
                if smartphone_dict is None:
                    continue
                device_metadata_dicts = extract_device_metadata(smartphone_dict)
                if device_metadata_dicts:
                    device_metadata_list.extend(device_metadata_dicts)
        finally:
            soup_page.decompose()


def main():
    """Scrape every brand and export the device metadata as CSV files.

    Each brand is written to Exports/Brands/<brand>_export.csv. A brand whose
    file already exists is not scraped again; its file is read back instead.
    All brands are finally merged into Exports/all_brands_export.csv. The run
    stops without writing the merged file when an existing brand file cannot
    be parsed.
    """
    # Called for its side effect: it configures the module logger.
    parse_args()

    network = tor_network()

    url_index = "https://www.gsmarena.com/makers.php3"
    soup_index = network.get_soup(url_index)
    if soup_index is None:
        return

    brand_table = soup_index.find("div", {"class": "st-text"})
    if brand_table is None:
        logger.error("%s : div class=st-text not found", url_index)
        return
    brands = brand_table.find_all("a")

    Path("Exports/Brands").mkdir(parents=True, exist_ok=True)

    brand_frames = []
    # The brand links belong to soup_index; decompose() would destroy them,
    # so the index page is only released after the loop.
    try:
        for brand in brands:
            brand_name = extract_brand_name(brand)
            brand_export_file = f"Exports/Brands/{brand_name}_export.csv"
            # If file doesn't already exists, extract smartphone information.
            if not Path(brand_export_file).is_file():
                brand_dict = pd.DataFrame.from_records(
                    extract_brand_infos(network, brand)
                )
                brand_dict.to_csv(brand_export_file, sep=";", index=False)
            # Otherwise, read the file.
            else:
                logger.warning(
                    "Skipping %s, %s already exists. Its content will be added to the global export file.",
                    brand_name,
                    brand_export_file,
                )
                try:
                    brand_dict = pd.read_csv(brand_export_file, sep=";")
                except pd.errors.EmptyDataError:
                    # The file has no columns, so there is nothing to merge.
                    # Reusing the previous brand's frame here would duplicate
                    # its rows.
                    continue
                except Exception as e:
                    logger.error("parse file %s error: %s", brand_export_file, e)
                    return
            brand_frames.append(brand_dict)
    finally:
        soup_index.decompose()

    all_export_file = "Exports/all_brands_export.csv"
    logger.info("Exporting all devices metadata to %s.", all_export_file)
    # pd.concat rejects an empty list, hence the explicit empty frame.
    if brand_frames:
        global_list_smartphones = pd.concat(brand_frames, sort=False)
    else:
        global_list_smartphones = pd.DataFrame()
    global_list_smartphones.to_csv(all_export_file, sep=";", index=False)


def parse_args(argv=None):
    """Parse the command line and configure the module logger.

    Args:
        argv: Argument list to parse; None makes argparse read ``sys.argv``.

    Returns:
        The parsed namespace; ``loglevel`` is logging.DEBUG when --debug is
        given and logging.INFO otherwise.
    """
    parser = argparse.ArgumentParser(description="Scraper gsmarena.")
    parser.add_argument(
        "--debug",
        help="Display debugging information",
        action="store_const",
        dest="loglevel",
        const=logging.DEBUG,
        default=logging.INFO,
    )
    args = parser.parse_args(argv)

    logger.setLevel(args.loglevel)
    handler = logging.StreamHandler()
    # The handler must accept the requested level too; pinned to INFO it
    # dropped every DEBUG record even with --debug.
    handler.setLevel(args.loglevel)
    logger.addHandler(handler)
    return args


def test(smartphone):
    """Scrape a single device and print its metadata rows (manual check).

    Args:
        smartphone: Either a parsed list item containing the device link, or
            the device page's relative URL as a string, e.g.
            "xiaomi_redmi_note_7-9513.php".
    """
    if isinstance(smartphone, str):
        # extract_smartphone_infos() expects an element with an <a href>
        # child, so wrap a bare URL in a minimal <li><a href=...></a></li>.
        wrapper = BeautifulSoup("", features="lxml")
        list_item = wrapper.new_tag("li")
        list_item.append(wrapper.new_tag("a", href=smartphone))
        smartphone = list_item

    network = tor_network()
    smartphone_dict = extract_smartphone_infos(network, smartphone)
    if smartphone_dict is None:
        # Nothing was scraped; extract_device_metadata() cannot take None.
        print("No data could be extracted for this device.")
        return
    device_metadata_dicts = extract_device_metadata(smartphone_dict)
    print(device_metadata_dicts)


# Run the full scrape only when the file is executed as a script.
if __name__ == "__main__":
    main()
    # Single-device alternative kept for manual debugging:
    # test("xiaomi_redmi_note_7-9513.php")
