#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
import sys
import urllib2
import json
import hashlib
from optparse import OptionParser
import sys
import time
import re
import os

import multipart

LOG = logging.getLogger('sites')


# HTTP status codes treated as transient: the request is retried after a pause.
_RETRYABLE_HTTP_CODES = (500, 502, 503, 504)
# Pause between two attempts to send the same document, in seconds.
_RETRY_DELAY_SECONDS = 2


def _read_document(file_path):
    """Parse one downloaded document file.

    Documents downloaded by zoracl have the layout: the 1st line is the url,
    the 2nd line is the document length, the rest is the document itself.

    Returns a ``(url, declared_length, document)`` tuple.
    Raises ``IOError`` when the file cannot be read and ``ValueError`` when
    the two header lines are missing or the length is not an integer.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_path, 'r') as source:
        lines = source.read().splitlines(True)

    if len(lines) < 2:
        raise ValueError('expected url and length lines, got %d line(s)' % len(lines))

    url = lines[0].strip()
    declared_length = int(lines[1].strip())
    return url, declared_length, ''.join(lines[2:])


def _post_with_retries(request, timeout, headers, body):
    """Send ``request``, retrying on transient failures until it is settled.

    Retryable HTTP codes and ``URLError`` are retried after a fixed pause,
    with no upper bound on the number of attempts. Any other HTTP code or
    unexpected exception is logged and the document is given up on.
    ``headers`` and ``body`` are only used for the error log messages.
    """
    failed_attempts = 0
    while True:
        try:
            response = urllib2.urlopen(request, timeout=timeout)
        except urllib2.HTTPError as e:
            # HTTPError must be handled before URLError (it is a subclass).
            if e.code not in _RETRYABLE_HTTP_CODES:
                LOG.error('Document failed with error: code = %s, response = %s; '
                          'message: headers %s, body %s', e.code, e.read(), headers, body)
                return
            LOG.warning('Document failed with error: code = %s, response = %s', e.code, e.read())
        except urllib2.URLError as e:
            LOG.warning('Document failed with error: %s, %s', e, e.reason)
        except Exception as e:
            LOG.error('Some strange error %s, message: headers %s, body %s',
                      e, headers, body, exc_info=True)
            return
        else:
            # Release the connection; the response payload is not used.
            response.close()
            # Number of failed attempts that preceded this success.
            print(failed_attempts)
            return

        # Only transient failures reach this point: wait before retrying so
        # that an unreachable indexer is not hammered in a tight loop.
        failed_attempts += 1
        time.sleep(_RETRY_DELAY_SECONDS)


def index_sites(documents_dir, documents_name, indexer_url, timeout):
    """Send every matching document file in a directory to the indexer.

    Files are processed in sorted name order. Each one is parsed, wrapped
    into a multipart request (a JSON description plus the raw document) and
    posted to ``indexer_url``.

    Args:
        documents_dir: directory that holds the downloaded document files.
        documents_name: regular expression; only file names it matches from
            the start (``re.match``) are processed.
        indexer_url: URL the multipart requests are posted to.
        timeout: timeout passed to ``urllib2.urlopen`` for each attempt.

    Unreadable or malformed files are logged and skipped so that one bad
    file does not abort the rest of the batch. Returns ``None``.
    """
    path = os.path.abspath(documents_dir)
    data_name_re = re.compile(documents_name)

    for file_name in sorted(os.listdir(path)):
        if not data_name_re.match(file_name):
            continue

        file_path = os.path.join(path, file_name)
        try:
            url, document_length, document = _read_document(file_path)
        except (IOError, ValueError) as e:
            LOG.error('Skipping malformed document file %s: %s', file_path, e)
            continue

        # A length mismatch is reported, but the document is still sent.
        if len(document) != document_length:
            print('Bad document')

        # json-message
        json_message = json.dumps({
            'send_type': 'modify',  # index type
            'keyprefix': 1,
            'url': hashlib.md5('%s' % url).hexdigest(),  # document url
            'mime_type': 'text/html',
            'charset': 'utf8',
            'document_attributes': [{'name': 'url', 'value': url},],
        })

        data = {'json_message': json_message,
                'document': document}
        body, headers = multipart.encode(data, {})
        request = urllib2.Request(multipart.smart_str(indexer_url), body, headers)
        _post_with_retries(request, timeout, headers, body)


# Script entry point: read the JSON config given with -c, set up logging and
# index the documents the config points to.
if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-c", "--config", dest="config", help="address to config for index site documents")

    options, _ = parser.parse_args()

    if not options.config:
        parser.error("option -c required")

    # The context manager closes the config file as soon as it is parsed.
    with open(options.config) as config_file:
        config = json.load(config_file)

    # Fail with a usage error instead of an obscure TypeError later on.
    missing_keys = [key for key in ('documents_dir', 'documents_name', 'indexer_url')
                    if not config.get(key)]
    if missing_keys:
        parser.error("config is missing required keys: %s" % ', '.join(missing_keys))

    # Log to the file named by 'logname' when it is set, to stdout otherwise.
    if config.get('logname'):
        log_writer = logging.FileHandler(config.get('logname'))
    else:
        log_writer = logging.StreamHandler(sys.stdout)

    log_format = '[%(process)s %(thread)d] %(asctime)s %(levelname)s %(name)s: %(message)s'
    log_writer.setFormatter(logging.Formatter(log_format))

    LOG.setLevel(logging.DEBUG)
    # The handler goes on the root logger; records from LOG propagate to it.
    logging.getLogger().addHandler(log_writer)

    # 'timeout' is optional: a missing value is passed on as None.
    index_sites(config.get('documents_dir'), config.get('documents_name'), config.get('indexer_url'), config.get('timeout'))

