#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import requests as req

from collections import defaultdict
from bs4 import BeautifulSoup, NavigableString
from defusedxml.ElementTree import fromstring

from commit_search import CommitSearch


class VulnCrawl(object):
    """Collect vulnerability records from release posts of a Blogger Atom feed.

    The crawler downloads the feed at ``blog_url``, keeps the posts whose
    title contains the selected channel string, extracts the release version
    and the per-bug records (severity, CVE id, crbug link) from the post HTML
    and asks ``CommitSearch`` for a matching commit.

    Results accumulate in ``self.vulns`` as
    ``{version: {crbug_id: {"severity", "cve", "crbug_url", "commit_url",
    "product"}}}``.
    """

    blog_url = "https://www.blogger.com/feeds/8982037438137564684/posts/default"
    post_id_str = "postID="
    atom_link = "{http://www.w3.org/2005/Atom}link"
    atom_content = "{http://www.w3.org/2005/Atom}content"
    atom_title = "{http://www.w3.org/2005/Atom}title"
    channels = {"stable_desktop": "Stable Channel Update for Desktop"}
    cr_log_url_re = re.compile(r"https://chromium\.googlesource\.com/chromium/src/\+log/", re.IGNORECASE)
    crbug_re = re.compile(r"https://crbug\.com/\d+", re.IGNORECASE)
    severity_re = re.compile("Critical|High|Medium|Low", re.IGNORECASE)
    severity = {"Critical": 3, "High": 2, "Medium": 1, "Low": 0}
    cve_re = re.compile(r"CVE-\d+-\d+", re.IGNORECASE)
    product_re = re.compile(r"v8|pdfium", re.IGNORECASE)
    # Seconds to wait for each HTTP request before giving up.
    request_timeout = 30

    def __init__(self, max_res, min_sever="Medium", release=None, channel="stable_desktop"):
        """Configure the crawler.

        :param max_res: value sent as the feed's ``max-results`` query parameter.
        :param min_sever: key of ``severity`` ("Critical", "High", "Medium", "Low").
        :param release: when set, only posts for exactly this version are parsed.
        :param channel: key of ``channels`` selecting the post title to look for.
        :raises NameError: if ``channel`` or ``min_sever`` is not a known key.
        """
        self.max_res = max_res
        self.release = release
        self.channel = channel
        self.channel_str = self.channels.get(channel)
        if not self.channel_str:
            raise NameError(channel)

        # NOTE(review): min_sev_level is validated and stored here, but nothing
        # in this class reads it afterwards - confirm whether filtering is wanted.
        self.min_sev_level = self.severity.get(min_sever)
        # "Low" maps to level 0, so test for a missing key rather than falsiness.
        if self.min_sev_level is None:
            raise NameError(min_sever)

        self.vulns = defaultdict(dict)

    def _posts_feed_content(self):
        """Download and parse the posts feed.

        :return: the parsed feed root element, or ``None`` if the request or
                 the XML parsing failed (the failure is printed).
        """
        try:
            feed = req.get(self.blog_url, params={"max-results": self.max_res},
                           timeout=self.request_timeout)
            feed.raise_for_status()
        except req.RequestException as ex:
            # No response object may exist here, so report the configured URL.
            print("[-] Failed to request {} : {}".format(self.blog_url, ex))
            return None

        try:
            content = fromstring(feed.content)
        except Exception as ex:  # any parser failure means the feed is unusable
            print("[-] Failed to build etree for {}: {}".format(feed.url, ex))
            return None

        print("[+] Collected posts feed from {}".format(feed.url))
        return content

    def _extract_post_ids(self, content):
        """Return the set of post ids referenced by the feed's Atom links.

        Every ``link`` element found in (or under) a direct child of
        ``content`` is inspected; the id is the text between ``postID=`` and
        the next ``&`` in its ``href``.
        """
        post_ids = set()
        # Iterate the element directly: Element.getchildren() no longer exists
        # on current Python versions.
        for child in content:
            for link in child.iter(self.atom_link):
                href = link.attrib.get("href", "")
                _, marker, tail = href.partition(self.post_id_str)
                if not marker:
                    continue
                post_id = tail.split("&")[0]
                if post_id:
                    post_ids.add(post_id)

        return post_ids

    def _version_from_soup(self, soup):
        """Return the new version named in the post's changelog link, or ``None``.

        The link looks like
        https://chromium.googlesource.com/chromium/src/+log/<prev_version>..<new_version>?pretty
        When several such links exist the last one wins.
        """
        ver = None
        for a in soup.find_all("a", href=self.cr_log_url_re):
            _, sep, tail = a["href"].partition("..")
            if not sep:
                # Log link without a version range; nothing to extract.
                continue
            ver = tail.split("?")[0]

        return ver

    @staticmethod
    def _find_commit_url(product, branch, crbug_id):
        """Return one item from ``CommitSearch.find_bug_commit()`` or ``""``."""
        commits = CommitSearch(product, branch, crbug_id).find_bug_commit()
        return commits.pop() if commits else ""

    def _parse_record(self, record, version, crbug_id, url):
        """Parse one bug line and store it in ``self.vulns[version][crbug_id]``.

        :param record: text of the bug line; must contain a CVE id and a
                       severity word, otherwise the record is skipped.
        :param version: release version the record belongs to.
        :param crbug_id: bug identifier (the link text of the crbug anchor).
        :param url: ``href`` of the crbug anchor.
        """
        try:
            cve_match = self.cve_re.search(record)
            severity_match = self.severity_re.search(record)
            product_match = self.product_re.search(record)
        except TypeError as ex:
            print("[-] Exception while parsing {} {}".format(record, ex))
            return
        if cve_match is None or severity_match is None:
            print("[-] Unable to find CVE id or severity in {}".format(record))
            return

        cve_id = cve_match.group(0)
        severity = severity_match.group(0)

        if product_match:
            product = product_match.group(0).lower()
            commit_url = self._find_commit_url(product, "master", crbug_id)
        else:
            # Records for Chromium itself carry no product substring.
            product = "chromium"
            commit_url = self._find_commit_url(product, version, crbug_id)
            if not commit_url:
                # Nothing found in Chromium: fall back to a v8 search.
                v8_commit_url = self._find_commit_url("v8", "master", crbug_id)
                if v8_commit_url:
                    product = "v8"
                    commit_url = v8_commit_url

        info = {"severity": severity, "cve": cve_id, "crbug_url": url,
                "commit_url": commit_url, "product": product}
        print(info)
        self.vulns[version][crbug_id] = info

    def _parse_blocks(self, id_hrefs, brs, version):
        """Pair ``<br>``-terminated bug lines with crbug anchors and parse them.

        For every ``<br>`` whose preceding text starts with ``]`` and does not
        mention "internal audits", the last element of ``id_hrefs`` is popped
        and parsed together with that text. ``id_hrefs`` is consumed in place.
        """
        for br in brs:
            if not id_hrefs:
                # No anchors left to pair with; nothing more can be parsed.
                break

            data = getattr(br.previous_element, "string", None)
            if data is None:
                continue
            if not data.startswith("]") or "internal audits" in data:
                continue

            anchor = id_hrefs.pop()
            try:
                self._parse_record(data, version, anchor.text, anchor.get("href", ""))
            except AttributeError as ex:
                # Best effort: one malformed record must not stop the others,
                # but report it instead of dropping it silently.
                print("[-] Skipping record {}: {}".format(data, ex))

    def _parse_ptags(self, soup, version):
        """Extract bug records from ``soup`` using the three known post layouts."""
        # variant 1: one <p> per bug, starting with "[" and holding the crbug link
        for p in soup.find_all("p"):
            if not p.text.startswith("["):
                continue
            anchor = p.a
            if anchor is None:
                continue
            self._parse_record(p.text, version, anchor.text, anchor.get("href", ""))

        # variant 2 https://www.blogger.com/feeds/8982037438137564684/posts/default/7849219922267498677
        for div in soup.find_all("div"):
            id_hrefs = div.find_all("a")
            # Reversed so that pop() hands the anchors out in document order.
            id_hrefs.reverse()
            self._parse_blocks(id_hrefs, div.find_all("br"), version)

        # variant 3 https://www.blogger.com/feeds/8982037438137564684/posts/default/3952572103814780022
        id_hrefs = [a for a in soup.find_all("a")
                    if a.get("href") and self.crbug_re.search(a.get("href"))]
        id_hrefs.reverse()
        self._parse_blocks(id_hrefs, soup.find_all("br"), version)

    def _bugs_form_soup(self, version, soup):
        """Dispatch parsing: whole document if it has <p> tags or no <div>, else per <div>."""
        if soup.find_all("p"):
            self._parse_ptags(soup, version)
            return

        divs = soup.find_all("div")
        if not divs:
            print("[-] Unable to find div elements")
            self._parse_ptags(soup, version)
            return

        for d in divs:
            self._parse_ptags(d, version)

    def _extract_bug_info(self, content):
        """Parse one post's HTML and record its bugs.

        Posts without a recognizable version, or for a version other than
        ``self.release`` (when set), are ignored.
        """
        soup = BeautifulSoup(content, "html.parser")
        version = self._version_from_soup(soup)
        if not version:
            return None

        if self.release and version != self.release:
            return None

        self._bugs_form_soup(version, soup)

    def _parse_update_posts(self, post_ids):
        """Fetch every post in ``post_ids`` and parse those for the selected channel.

        A post that cannot be downloaded or parsed is reported and skipped so
        that the remaining posts are still processed.
        """
        for post_id in post_ids:
            post_url = self.blog_url + "/" + post_id
            print("[+] Parsing blog post: {}".format(post_url))
            try:
                resp = req.get(post_url, timeout=self.request_timeout)
                resp.raise_for_status()
            except req.RequestException as ex:
                print("[-] Failed to request {} : {}".format(post_url, ex))
                continue

            try:
                post_data = fromstring(resp.content)
            except Exception as ex:  # any parser failure means the post is unusable
                print("[-] Failed to build etree for {}: {}".format(post_url, ex))
                continue

            title = post_data.find(self.atom_title)
            title_text = title.text if title is not None else None
            if not title_text or self.channel_str not in title_text:
                print("Skipping post {post_id} {title}".format(post_id=post_id, title=title_text))
                continue

            content = post_data.find(self.atom_content)
            if content is None or not content.text:
                print("Skipping post {post_id}, cause content is empty".format(post_id=post_id))
                continue
            print("[+] Post {post_id} {title} contains required info".format(post_id=post_id, title=title_text))
            self._extract_bug_info(content.text)

    def get_vulns(self):
        """Run the crawl and return ``self.vulns``.

        If the feed cannot be fetched or parsed, the (possibly empty) results
        collected so far are returned.
        """
        feed = self._posts_feed_content()
        if feed is None:
            return self.vulns

        post_ids = self._extract_post_ids(feed)
        self._parse_update_posts(post_ids)
        return self.vulns
