# coding: utf-8



import logging
from urllib.parse import urlparse
import re
from lxml import etree

from django.utils.encoding import force_text

from .base import RepoTuple, RepoCrawler


log = logging.getLogger(__name__)


class GitWebCrawler(RepoCrawler):
    """Crawler that discovers repositories listed on a gitweb index page.

    The project list at ``self.source.web_url`` is downloaded and parsed as
    HTML.  Every row of the ``project_list`` table that exposes a ``tree``
    link and whose description link path matches :attr:`PATTERN` is reported
    as a :class:`RepoTuple`.
    """

    # Matched against the *path* of the description link, anchored at its
    # start; group 1 is the repository name without the ``.git`` suffix.
    PATTERN = re.compile(r'/gitweb/(.+)\.git')

    # Branch reported when the summary page cannot be used to find the
    # current head.
    DEFAULT_BRANCH = 'master'

    # Timeout passed to every ``self.session.get`` call made by this class.
    REQUEST_TIMEOUT = 5

    def ensure_valid_url(self, url):
        """Return ``url`` with scheme and host filled in when it has no host.

        Links without a hostname borrow ``scheme`` and ``netloc`` from
        ``self.source.web_url``; path, query and fragment are kept as they
        are.  URLs that already carry a hostname are returned unchanged.
        """
        parse_result = urlparse(url)
        if parse_result.hostname:
            return url

        parsed_source_url = urlparse(self.source.web_url)
        return parse_result._replace(
            netloc=parsed_source_url.netloc, scheme=parsed_source_url.scheme
        ).geturl()

    def get_repos(self, **kwargs):
        """Yield a :class:`RepoTuple` for every usable project row.

        Rows are skipped when they lack a description link, have no ``tree``
        link in their last cell, or when the description link path does not
        match :attr:`PATTERN`.  The per-repository summary page is requested
        only for rows that pass all of these checks, so skipped rows cost no
        extra HTTP round trip.

        ``kwargs`` is accepted for interface compatibility and is not used.

        Raises whatever ``response.raise_for_status()`` raises when the
        project list itself cannot be fetched.
        """
        log.info('Going to "%s"', self.source.web_url)

        response = self.session.get(url=self.source.web_url,
                                    timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        log.info('Parsing "%s"', self.source.web_url)

        # Deliberately unguarded: an unparsable list page must fail loudly
        # rather than look like "this source has no repositories".
        doc = etree.fromstring(response.content, parser=etree.HTMLParser())

        for node in doc.xpath("//table[@class='project_list']/"
                              "tr[@class='light' or @class='dark']"):
            tds = node.xpath('./td')

            # The second cell is expected to hold the description link.
            desc_links = tds[1].xpath('./a') if len(tds) > 1 else []
            if not desc_links:
                log.warning('Skipping project row without a description link')
                continue
            desc_link = desc_links[0]

            # Only rows offering a "tree" link in the last cell are reported.
            if not any(a.text == 'tree' for a in tds[-1].xpath('./a')):
                continue

            parsed_link = urlparse(desc_link.get('href') or '')

            log.info('Parsed link is "%s"', parsed_link.path)

            m = self.PATTERN.match(parsed_link.path)
            if not m:
                continue

            name = m.group(1)

            log.info('Repository name is "%s"', name)

            # The network request is made last, once the row is known to be
            # a repository that will actually be yielded.
            default_branch = self._get_default_branch(node)

            yield RepoTuple(
                name=force_text(self.normalize_name(name)),
                owner='root',
                description=force_text(desc_link.text or ''),
                vcs_name=force_text(name + '.git'),
                default_branch=force_text(default_branch),
                is_public=True,
            )

    def _get_default_branch(self, node):
        """Return the current head named on the row's summary page.

        Best effort: :attr:`DEFAULT_BRANCH` is returned when the row has no
        summary link, the response is not ``ok``, the body cannot be parsed
        as HTML, or the page has no ``current_head`` cell.  Errors raised by
        ``self.session.get`` itself are not caught here.
        """
        summary_hrefs = node.xpath('./td[@class="link"]/a/@href')
        if not summary_hrefs:
            log.warning('Project row has no summary link, assuming "%s"',
                        self.DEFAULT_BRANCH)
            return self.DEFAULT_BRANCH

        summary_url = self.ensure_valid_url(summary_hrefs[0])

        log.info('The url in gitweb is "%s"', summary_url)

        summary_resp = self.session.get(url=summary_url,
                                        timeout=self.REQUEST_TIMEOUT)
        if not summary_resp.ok:
            return self.DEFAULT_BRANCH

        try:
            summary_xml = etree.fromstring(summary_resp.content,
                                           parser=etree.HTMLParser())
        except etree.XMLSyntaxError:
            # E.g. an empty body; treat it like a page without a head.
            return self.DEFAULT_BRANCH

        # The HTML parser can hand back None for a document with no content.
        if summary_xml is None:
            return self.DEFAULT_BRANCH

        branch = summary_xml.xpath('//td[@class="current_head"]/a/text()')
        return branch[0] if branch else self.DEFAULT_BRANCH

    def get_repo_url(self, repo):
        """Return the web URL of ``repo``: ``<web_url>/<vcs_name>``."""
        return '%s/%s' % (self.source.web_url, repo.vcs_name)

    def get_commit_url(self, repo, id_):
        """Return the URL of commit ``id_`` of ``repo`` in the web UI."""
        return self.get_repo_url(repo) + '?a=commit&h=%s' % id_

