import codecs
import sys

import requests
import io
import gzip
import xml.sax
import argparse
import traceback
import io
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

USER_AGENT = 'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)'
REQUEST_TIMEOUT = 90
REQUEST_RETRIES = 5


class SitemapUrlExtractor(xml.sax.ContentHandler):
    """SAX handler that collects the text of ``<loc>`` elements.

    The name of the first element seen (the document root) selects the mode:
    with a ``sitemapindex`` root the locations are stored as nested sitemaps,
    with a ``urlset`` root they are stored as page URLs. Locations found
    under any other root are discarded. Only elements whose name is exactly
    ``loc`` are collected.
    """

    def __init__(self):
        # Explicit base-class call instead of super(): ContentHandler is an
        # old-style class on Python 2, where super() cannot be used.
        xml.sax.ContentHandler.__init__(self)
        self.urls = set()
        self.sitemaps = set()
        # True while the parser is inside a <loc> element.
        self.intag = False
        # Name of the root element; None until the first element is seen.
        self.mode = None
        # Text accumulated for the element currently being read.
        self.content = ''

    def startElement(self, name, attrs):
        """Record the root element name, or note whether a ``<loc>`` starts."""
        if self.mode is None:
            self.mode = name
        else:
            self.intag = (name == 'loc')

        # Every new element starts with an empty text buffer.
        self.content = ''

    def characters(self, content):
        """Append a chunk of character data while inside ``<loc>``."""
        # The parser may deliver one text node in several chunks.
        if self.intag:
            self.content += content

    def endElement(self, name):
        """Store the collected location in the set matching the root mode."""
        if self.intag:
            url = self.content.strip()
            if url:
                if self.mode == 'sitemapindex':
                    self.sitemaps.add(url)
                elif self.mode == 'urlset':
                    self.urls.add(url)

        self.intag = False

    def get_urls(self):
        """Return the set of page URLs collected so far."""
        return self.urls

    def get_sitemaps(self):
        """Return the set of nested sitemap URLs collected so far."""
        return self.sitemaps


def requests_retry_session(
        retries=REQUEST_RETRIES,
        status_forcelist=(500, 502, 504)):
    """Build a ``requests`` session whose HTTP and HTTPS adapters retry.

    ``retries`` is used as the total, read and connect retry count of the
    ``Retry`` policy; ``status_forcelist`` is passed to that policy unchanged.
    Returns the configured session.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        status_forcelist=status_forcelist,
    )
    http = requests.Session()
    # One adapter instance serves both URL schemes.
    shared_adapter = HTTPAdapter(max_retries=retry_policy)
    for scheme in ('http://', 'https://'):
        http.mount(scheme, shared_adapter)
    return http


def fetch_with_requests(sitemap_url):
    """Download ``sitemap_url`` and return ``(status_code, headers, content)``.

    The request goes through a retrying session with the module's timeout
    and User-Agent. ``content`` is the raw response body as bytes. The
    session is closed before returning so its connections are not leaked.
    """
    # %-formatting keeps unicode URLs intact; on Python 2 str() would raise
    # UnicodeEncodeError for a unicode URL with non-ASCII characters.
    print("Downloading %s" % sitemap_url)
    with requests_retry_session() as session:
        response = session.get(
            sitemap_url,
            stream=True,
            timeout=REQUEST_TIMEOUT,
            headers={'User-Agent': USER_AGENT},
        )
        # With stream=True the body is fetched when .content is accessed, so
        # it must be read while the session is still open.
        return response.status_code, response.headers, response.content


def save_sitemap_urls(urls, path):
    """Append every URL in ``urls`` to the UTF-8 text file at ``path``.

    One URL is written per line. ``urls`` may be any iterable of text or
    byte strings; byte strings are decoded as UTF-8 first, because the file
    is opened in text mode and a text stream rejects bytes.
    """
    with io.open(path, mode="a", encoding="utf-8") as outfile:
        for url in urls:
            if isinstance(url, bytes):
                url = url.decode("utf-8")
            outfile.write(url + u"\n")

def _detect_content_type(headers, content):
    """Classify a downloaded sitemap body as 'text/xml' or 'text/plain'."""
    declared = ''
    if 'content-type' in headers:
        declared = headers['content-type'].split(';')[0].strip().lower()
    if declared in ('text/plain', 'text/xml'):
        return declared

    # Header missing or not one of the two handled types (for example
    # application/xml): decide from the body instead. A UTF-8 BOM and
    # leading whitespace are skipped before looking for the first '<'.
    head = content[:1024].lstrip(b'\xef\xbb\xbf \t\r\n')
    return 'text/xml' if head.startswith(b'<') else 'text/plain'


def _parse_xml_sitemap(content, content_type):
    """Parse an XML sitemap body and return its (sitemaps, urls) sets."""
    extractor = SitemapUrlExtractor()
    parser = xml.sax.make_parser()
    # The body comes from the network, so external general entities are
    # switched off explicitly rather than relying on the interpreter default.
    parser.setFeature(xml.sax.handler.feature_external_ges, False)
    parser.setContentHandler(extractor)
    try:
        parser.parse(io.BytesIO(content))
    except Exception as e:
        # A Python issue prevents exceptions thrown by the SAX parser from
        # being handled correctly by multiprocessing.Pool, so rethrow as a
        # simpler exception which does not cause the problem.
        raise RuntimeError(
            "Cannot parse xml: {} (content type: {}, content: \"{}...\")".format(
                str(e), content_type, content[:64]))
    return extractor.get_sitemaps(), extractor.get_urls()


def process_sitemap(sitemap_url,
                    output_path='/Users/leonidrom/src/tmp/sitemap/sm_urls.txt',
                    visited=None):
    """Download a sitemap, save its URLs and recurse into nested sitemaps.

    Args:
        sitemap_url: Address of the sitemap (XML, plain text, or either of
            those gzipped).
        output_path: File the extracted URLs are appended to.
        visited: Set of sitemap URLs already handled in this crawl; created
            on the first call and shared with recursive calls so that a
            sitemap index referencing itself cannot recurse forever.

    Raises:
        Exception: The server answered with a status code of 400 or above.
        RuntimeError: The body was treated as XML but could not be parsed.
        UnicodeDecodeError: A plain-text body is not valid UTF-8.
    """
    if visited is None:
        visited = set()
    if sitemap_url in visited:
        print("Skipping already processed sitemap: %s" % sitemap_url)
        return
    visited.add(sitemap_url)

    # %-formatting (not str()) so unicode URLs survive on Python 2.
    print("Processing sitemap: %s" % sitemap_url)
    status_code, headers, content = fetch_with_requests(sitemap_url)

    if status_code >= 400:
        raise Exception("Bad status code: " + str(status_code))

    print("Extracting content from %s" % sitemap_url)

    # Gzipped bodies are detected by the gzip magic number.
    if content.startswith(b'\x1f\x8b'):
        with io.BytesIO(content) as f:
            content = gzip.GzipFile(fileobj=f).read()

    content_type = _detect_content_type(headers, content)

    sitemaps, urls = [], []
    if content_type == 'text/plain':
        # Decode once so the URLs are text, as the output file requires;
        # utf-8-sig also drops a leading BOM.
        text = content.decode('utf-8-sig')
        stripped = (line.strip() for line in text.splitlines())
        urls = [line for line in stripped if line]
    elif content_type == 'text/xml':
        sitemaps, urls = _parse_xml_sitemap(content, content_type)

    print("Got %d urls, %d sitemaps from %s" % (len(urls), len(sitemaps), sitemap_url))
    print("Saving urls for %s" % sitemap_url)
    save_sitemap_urls(urls, output_path)
    for sm in sitemaps:
        process_sitemap(sm, output_path, visited)


if __name__ == "__main__":
    # On Python 2 stdout is a byte stream: wrap it so unicode URLs can be
    # printed. On Python 3 stdout is already a text stream and wrapping it in
    # a codecs writer would make print() fail, so the wrapper is skipped.
    if sys.version_info[0] == 2:
        sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # The sitemap to crawl may be given on the command line; with no
    # argument the previously hard-coded address is used.
    arg_parser = argparse.ArgumentParser(
        description="Collect all URLs listed in a sitemap and its nested sitemaps.")
    arg_parser.add_argument(
        'sitemap_url',
        nargs='?',
        default="https://seafood-shop.ru/sitemap.xml",
        help="URL of the root sitemap (default: %(default)s)")
    cli_args = arg_parser.parse_args()
    process_sitemap(cli_args.sitemap_url)
