# -*- coding: utf-8 -*-

import re
import logging
import requests
from requests.auth import HTTPBasicAuth
from multiprocessing.dummy import Pool
import json

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the GitHub API. Endpoint paths are concatenated onto it
# directly, so the trailing slash is required.
API = "https://api.github.yandex-team.ru/"
# User name used for HTTP basic auth when the caller does not supply one.
DEFAULT_USER = 'oko-robot'
# Code-search query prefix; the organization login is appended right after "org:".
QUERY = "filename:package.json extension:json language:JSON dependencies org:"
# Matches the blob URL of a file whose path ends in "package.json".
# Groups 1 and 2 are joined as "<1>/<2>" to form the project name; group 3 is
# the file path that follows the "\w+" segment (presumably the branch name).
PROJECT_EXTRACTOR = re.compile(r"^https://github\.yandex-team\.ru/(.+)/(.+)/blob/\w+/(.*package\.json)$")
# Number of worker threads used when fetching files in parallel.
NUM_THREADS = 8


class GithubClient:
    """Minimal HTTP GET client that adds basic auth once a token is known.

    ``token`` and ``user`` start out as class-level defaults and are
    overridden per instance by ``set_token``.
    """

    token = None
    user = DEFAULT_USER

    def set_token(self, token, user=DEFAULT_USER):
        """Remember the credentials to attach to subsequent ``get`` calls."""
        self.token, self.user = token, user

    def get(self, *args, **kwargs):
        """Forward to ``requests.get``, adding basic auth when a token is set.

        Any ``auth`` keyword supplied by the caller is replaced when a token
        is configured; without a token the call is passed through untouched.
        """
        if self.token:
            kwargs.update(auth=HTTPBasicAuth(self.user, self.token))
        return requests.get(*args, **kwargs)

# Shared module-level client: GithubCrawler.get_files installs the token on it
# and the GithubCrawler methods issue all their requests through it.
gh_client = GithubClient()


class GithubCrawler(object):
    """Collects package.json files (and their lock files) via the GitHub API.

    Every method is static and performs its requests through the module-level
    ``gh_client``. ``get_files`` is the crawl entry point;
    ``check_root_package`` is an independent URL helper.
    """

    # Captures the target of the rel="next" entry of a Link header.
    _NEXT_LINK_RE = re.compile(r'<([^>]*)>; rel="next"')
    # File name that lock-file URLs are derived from.
    _PACKAGE_FILE = "package.json"

    @staticmethod
    def get_files(token):
        """Crawl all organizations and return the package files found.

        Args:
            token: credential installed on the shared ``gh_client`` before
                any request is made.

        Returns:
            A list of dicts as produced by ``_get_file``.
        """
        gh_client.set_token(token)

        organizations = GithubCrawler._get_organizations()
        urls = GithubCrawler._get_urls(organizations)
        return GithubCrawler._get_files(urls)

    @staticmethod
    def _get_organizations():
        """Return the logins of all organizations listed by the API.

        Pages are requested until an empty one comes back; each follow-up
        request passes the id of the last organization seen as ``since``.

        Raises:
            requests.HTTPError: if any page request fails.
        """
        logger.debug("Getting organizations")
        result = []
        url = API + "organizations"

        params = None  # the first page is requested without "since"
        while True:
            r = gh_client.get(url, params)
            r.raise_for_status()

            page = r.json()
            if not page:
                break

            result.extend(org["login"] for org in page)
            params = {"since": page[-1]["id"]}

        logger.info("Got %s organizations", len(result))
        return result

    @staticmethod
    def _get_urls(organizations):
        """Return the search-result URLs of all given organizations.

        Args:
            organizations: iterable of organization logins.
        """
        logger.debug(
            "Getting urls %s",
            json.dumps(organizations, ensure_ascii=False, sort_keys=True, indent=2),
        )
        result = []

        for org in organizations:
            result.extend(GithubCrawler._get_urls_for_org(org))

        logger.info("Got %s urls", len(result))
        return result

    @staticmethod
    def _get_files(urls):
        """Fetch all ``urls`` in a thread pool and drop the ones that failed.

        Args:
            urls: list of file URLs as returned by ``_get_urls``.

        Returns:
            The truthy results of ``_get_file``, in the order of ``urls``.
        """
        logger.debug(
            "Getting files %s",
            json.dumps(urls, ensure_ascii=False, sort_keys=True, indent=2),
        )
        pool = Pool(NUM_THREADS)
        try:
            fetched = pool.map(GithubCrawler._get_file, urls)
        finally:
            # Release the worker threads even when a worker raised.
            pool.terminate()

        result = [item for item in fetched if item]
        logger.info("Got %s files", len(result))
        return result

    @staticmethod
    def _get_file(url):
        """Download one package.json together with its lock file and metadata.

        Args:
            url: API URL of the file; any query string is ignored when the
                file description is requested.

        Returns:
            A dict with the keys ``url``, ``path``, ``file``, ``lock_file``,
            ``lock_variant``, ``project``, ``last_commit_time`` and
            ``vcs_type``, or ``None`` when the file is skipped: a request
            failed, the html URL does not match ``PROJECT_EXTRACTOR``, or the
            repository is archived.
        """
        r = gh_client.get(url.split("?")[0])
        if r.status_code != requests.codes.ok:
            return None

        content = r.json()
        html_url = content["html_url"]
        download_url = content["download_url"]

        match = PROJECT_EXTRACTOR.match(html_url)
        if match is None:
            return None

        repo = "{}/{}".format(match.group(1), match.group(2))
        repo_api_url = "{}repos/{}".format(API, repo)

        repo_data = gh_client.get(repo_api_url).json()
        if repo_data.get("archived") is True:
            return None

        commits_response = gh_client.get(repo_api_url + "/commits?per_page=1")
        if commits_response.status_code != requests.codes.ok:
            # Skip this file instead of raising, which would abort the
            # whole pool.map in _get_files.
            logger.warning(
                "trouble with getting commits for %s, %s",
                repo,
                commits_response.status_code,
            )
            return None

        commits = commits_response.json()
        if not commits:
            logger.warning("no commits found for %s", repo)
            return None
        last_commit_time = commits[0]["commit"]["author"]["date"]

        package_response = gh_client.get(download_url)
        if package_response.status_code != requests.codes.ok:
            # Do not record an error body as the package.json content.
            logger.warning(
                "trouble with downloading package.json %s, %s",
                download_url,
                package_response.status_code,
            )
            return None
        package = package_response.text

        lock_variant = "package-lock.json"
        package_lock = GithubCrawler._get_lock_file(url, download_url, lock_variant)
        if package_lock is None:
            logger.debug("package-lock.json for %s not found, trying yarn.lock", url)
            lock_variant = "yarn.lock"
            package_lock = GithubCrawler._get_lock_file(url, download_url, lock_variant)

        if package_lock is None:
            logger.debug("No lock for %s", url)
            lock_variant = None

        return {
            "url": html_url,
            "path": content.get("path", ""),
            "file": package,
            "lock_file": package_lock,
            "lock_variant": lock_variant,
            "project": repo,
            "last_commit_time": last_commit_time,
            "vcs_type": "github",
        }

    @staticmethod
    def _replace_file_name(url, lock_file):
        """Return ``url`` with its trailing "package.json" swapped for ``lock_file``.

        Only the last occurrence in the part before any "?" is replaced, so
        a "package.json" appearing earlier in the URL (for example inside a
        repository name) is left alone. The query string is kept as is, and
        a URL without "package.json" is returned unchanged.
        """
        path, separator, query = url.partition("?")
        head, found, tail = path.rpartition(GithubCrawler._PACKAGE_FILE)
        if not found:
            return url
        return head + lock_file + tail + separator + query

    @staticmethod
    def _get_lock_file(package_url, download_url, lock_file):
        """Return the text of the lock file next to a package.json, if any.

        The lock file is first looked up through the API URL derived from
        ``package_url``; when that fails, the URL derived from
        ``download_url`` is tried directly as a workaround.

        Args:
            package_url: API URL of the package.json file.
            download_url: download URL of the package.json file.
            lock_file: lock file name, e.g. "package-lock.json" or "yarn.lock".

        Returns:
            The lock file content, or ``None`` when it could not be fetched.
        """
        url = GithubCrawler._replace_file_name(package_url, lock_file).split("?")[0]
        r = gh_client.get(url)
        if r.status_code != requests.codes.ok:
            logger.debug(
                "%s %s not found, %s; trying workaround",
                lock_file,
                url,
                r.status_code,
            )
            workaround_url = GithubCrawler._replace_file_name(download_url, lock_file)
            r = gh_client.get(workaround_url)
            if r.status_code != requests.codes.ok:
                logger.debug("workaround failed, %s", r.status_code)
                return None

            logger.debug("workaround success for %s, %s", workaround_url, r.status_code)
            return r.text

        download_url = r.json()["download_url"]
        r = gh_client.get(download_url)
        if r.status_code != requests.codes.ok:
            logger.debug("trouble with downloading %s %s", lock_file, download_url)
            return None
        return r.text

    @staticmethod
    def _get_urls_for_org(org):
        """Return the URLs of all code-search hits for one organization.

        The first page is best-effort: a response without an "items" key
        contributes nothing. Follow-up pages, reached through the Link
        header, must succeed.

        Raises:
            requests.HTTPError: if a follow-up page request fails.
        """
        url = API + "search/code"
        result = []

        r = gh_client.get(url, {"q": GithubCrawler._get_search_query(org), "per_page": 100})

        payload = r.json()
        if "items" in payload:
            result.extend(item["url"] for item in payload["items"])

        next_link = GithubCrawler._get_next_link(r)
        while next_link is not None:
            r = gh_client.get(next_link)
            r.raise_for_status()

            result.extend(item["url"] for item in r.json()["items"])
            next_link = GithubCrawler._get_next_link(r)

        return result

    @staticmethod
    def _get_search_query(org):
        """Return the code-search query restricted to organization ``org``."""
        return QUERY + org

    @staticmethod
    def _get_next_link(r):
        """Return the rel="next" URL from the response's Link header.

        Args:
            r: response object exposing a ``headers`` mapping.

        Returns:
            The URL of the next page, or ``None`` when there is no Link
            header or it has no rel="next" entry.
        """
        link_header = r.headers.get("Link")
        if link_header is None:
            return None

        for link in link_header.split(", "):
            match = GithubCrawler._NEXT_LINK_RE.search(link)
            if match is not None:
                return match.group(1)

        return None

    @staticmethod
    def check_root_package(url):
        """Tell whether ``url`` points at a package.json in the repository root.

        Returns:
            ``True`` when ``url`` matches ``PROJECT_EXTRACTOR`` and the file
            path inside the repository is exactly "package.json".
        """
        match = PROJECT_EXTRACTOR.match(url)
        if match is None:
            return False

        return match.group(3) == "package.json"
