import argparse, json, sys, urllib, urllib2, re, urlparse
import StringIO

from PIL import Image, ImageStat

# Default key under which each input record stores its image URL;
# main() overwrites it with the --url_field_name argument.
URL_FIELD_NAME = 'url'
# Default deviation threshold handed to detect_color_image() as `cutoff`;
# main() overwrites it with the --max_deviation_cutoff argument.
MAX_DEVIATION_CUTOFF = 100


def json_find(path, JSON, deep_search):
    """Look up ``path`` inside the parsed JSON object ``JSON``.

    When ``deep_search`` is true, ``path`` is split on "." and every part is
    used as a key one nesting level deeper than the previous one.  Otherwise
    ``path`` is used as a single key, dots included.

    Lookup errors raised by the containers (for example ``KeyError``) are not
    caught here.
    """
    if not deep_search:
        return JSON[path]

    node = JSON
    for part in path.split('.'):
        node = node[part]
    return node


def iri2uri(iri, encoding='utf8'):
    """Turn an IRI (which may contain non-ASCII text) into a URI.

    The scheme is encoded with ``encoding``.  The host part of the authority
    is IDNA-encoded; when the authority contains a ":", everything after the
    first one is appended unchanged.  Path, query and fragment are encoded
    with ``encoding`` and then percent-quoted.
    """
    # Passed as `safe` to urllib.quote: these characters are never quoted.
    unquoted = "/;%[]=:$&()+,!?*@'~"

    pieces = urlparse.urlsplit(iri)
    scheme = pieces[0].encode(encoding)

    netloc = pieces[1]
    host, colon, port = netloc.partition(":")
    if colon:
        authority = host.encode('idna') + ":%s" % port
    else:
        authority = netloc.encode('idna')

    # The remaining components are path, query and fragment, in that order.
    path, query, frag = [
        urllib.quote(piece.encode(encoding), safe=unquoted)
        for piece in pieces[2:]
    ]
    return urlparse.urlunsplit((scheme, authority, path, query, frag))


def str2bool(v):
    """Parse a yes/no style string into a bool (used as an argparse ``type``).

    The comparison ignores case.  'yes', 'true', 't', 'y' and '1' give True;
    'no', 'false', 'f', 'n' and '0' give False.

    Raises:
        argparse.ArgumentTypeError: for any other value.
    """
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


def create_argument_parser():
    """Build the command-line parser of the gray-scale detection tool.

    The arguments, their types, defaults and required flags are unchanged;
    every argument additionally carries a help text so that ``--help``
    explains what it is used for.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--url_field_name',
        required=True,
        help='Key of the image URL in each input record (a dot-separated '
             'path when --deep_search is enabled).',
    )
    parser.add_argument(
        '--max_deviation_cutoff',
        type=int,
        required=True,
        help='Deviation threshold; an image scoring below it is reported '
             'as gray scale.',
    )
    parser.add_argument(
        '--try_count',
        type=int,
        required=True,
        help='Maximum number of download attempts per image.',
    )
    parser.add_argument(
        '--input',
        required=True,
        help='Path of the JSON file with the records to process.',
    )
    parser.add_argument(
        '--output',
        required=True,
        help='Path of the JSON file that receives the successful results.',
    )
    parser.add_argument(
        '--errors',
        required=True,
        help='Path of the JSON file that receives the failed records.',
    )
    parser.add_argument(
        '--algorithm_version',
        required=False,
        default='v1',
        help="Detection algorithm: 'v1' (maximum deviation) or 'v2' "
             "(average deviation, image borders cut off). Default: v1.",
    )
    # nargs="?" with const=True lets a bare `--deep_search` mean true, while
    # `--deep_search no` is still parsed through str2bool.
    parser.add_argument(
        '--deep_search',
        type=str2bool,
        required=False,
        default=False,
        const=True,
        nargs="?",
        help='Treat --url_field_name as a dot-separated path into nested '
             'objects. Takes a yes/no style value; the bare flag means yes.',
    )
    parser.add_argument(
        '--url_timeout',
        required=False,
        type=int,
        default=10,
        help='Timeout in seconds for each download attempt. Default: 10.',
    )

    return parser


def download_image(url, try_count, timeout):
    """Open ``url`` and return the response, retrying on failure.

    At most ``try_count`` attempts are made, each with ``timeout`` passed to
    ``urllib2.urlopen``.  An attempt fails when ``urlopen`` raises or when
    the response status code is not 200.

    Returns:
        The response object of the first successful attempt (still open; the
        caller reads and closes it), or ``None`` when ``url`` is empty or
        every attempt failed.
    """
    if not url:
        return None

    attempt = 0
    while attempt < try_count:
        attempt += 1
        try:
            response = urllib2.urlopen(url, timeout=timeout)
        except IOError:
            # Deliberately quiet: just move on to the next attempt.
            continue
        except Exception as e:
            # Anything else is reported on stderr, but retried as well.
            sys.stderr.write(repr(e) + '\n')
            continue

        if response.getcode() == 200:
            return response

        # Unusable answer: close it before retrying instead of leaving the
        # response open.
        response.close()

    return None


def cut_image_border(img, percent):
    """Crop ``percent`` percent of the width/height off every side of ``img``.

    For ``percent`` below 0.1 the image is returned as is.  Otherwise the
    margin on the left and right is ``percent`` percent of the width, the
    margin on the top and bottom is ``percent`` percent of the height (both
    truncated to whole pixels), and the result of ``img.crop`` is returned.
    """
    if percent < 0.1:
        return img

    width, height = img.size
    margin_x = int(width / 100.0 * percent)
    margin_y = int(height / 100.0 * percent)
    box = (margin_x, margin_y, width - margin_x, height - margin_y)
    return img.crop(box)


def _utf8_url(url):
    """Return ``url`` UTF-8 encoded; values that cannot be encoded pass through.

    Non-string values (no ``encode`` method) and values whose encoding fails
    are returned unchanged, so building an error record never raises.
    """
    try:
        return url.encode('utf-8')
    except (AttributeError, UnicodeError):
        return url


def _error_result(url, message):
    """Write ``message`` followed by ``url`` to stderr; return the error record."""
    sys.stderr.write('{} {}\n'.format(message, url))
    return {
        'url': url,
        'errors': message,
    }


def _gray_deviation(pixel):
    """Return the summed squared distance of the pixel's channels from their mean."""
    mu = sum(pixel) / float(3)
    # Pixels come from an image converted to RGB, i.e. three channels each.
    return sum((channel - mu) * (channel - mu) for channel in pixel)


def detect_color_image(url, thumb_size=40, cutoff=100, try_count=1, timeout=10, algorithm='v1'):
    """Download the image at ``url`` and decide whether it is gray scale.

    The image is converted to RGB and resized to ``thumb_size`` x
    ``thumb_size``.  For every pixel the summed squared distance of its
    channels from their mean is computed:

    * ``algorithm='v1'`` takes the maximum over all pixels,
    * ``algorithm='v2'`` first cuts 15 percent off every border and takes the
      average over all pixels.

    The image counts as gray scale when that value is below ``cutoff``.

    Args:
        url: image address; non-ASCII characters are converted via iri2uri.
        thumb_size: edge length of the thumbnail that is analysed.
        cutoff: deviation threshold for the gray-scale verdict.
        try_count: download attempts, passed to download_image.
        timeout: download timeout, passed to download_image.
        algorithm: 'v1' or 'v2'.

    Returns:
        dict: on success ``'url'``, ``'is_gray_scale'`` and either
        ``'max_gray_deviation'`` (v1) or ``'avg_gray_deviation'`` (v2);
        on failure ``'url'`` and ``'errors'`` with a short description.
        ``'url'`` is the UTF-8 encoded input URL.
    """
    try:
        response = download_image(iri2uri(url), try_count, timeout)
    except (IOError, UnicodeError, TypeError, AttributeError):
        # AttributeError covers non-string URLs, which iri2uri cannot split;
        # without it a single bad record would abort the caller.
        return _error_result(_utf8_url(url), 'Error while downloading picture')

    url = _utf8_url(url)

    if response is None:
        return _error_result(url, 'Could not download picture')

    try:
        try:
            payload = response.read()
        finally:
            # The body is fully buffered (or reading failed), so the response
            # is no longer needed either way.
            response.close()
        pil_img = Image.open(StringIO.StringIO(payload))
    except (IOError, ValueError):
        return _error_result(url, 'Could not read image')

    try:
        pil_img = pil_img.convert('RGB')
    except IOError:
        return _error_result(url, 'Could not convert image to RGB')

    if algorithm == 'v1':
        thumb = pil_img.resize((thumb_size, thumb_size), Image.BICUBIC)
        current_max = 0
        for pixel in thumb.getdata():
            current_max = max(_gray_deviation(pixel), current_max)

        sys.stderr.write('IMAGE OK: {}\n'.format(url))
        return {
            'url': url,
            'is_gray_scale': current_max < cutoff,
            'max_gray_deviation': current_max,
        }
    elif algorithm == 'v2':
        bordered = cut_image_border(pil_img, 15)
        thumb = bordered.resize((thumb_size, thumb_size), Image.BICUBIC)
        current_sum = 0
        cnt = 0
        for pixel in thumb.getdata():
            current_sum += _gray_deviation(pixel)
            cnt += 1

        if cnt == 0:
            sys.stderr.write('IMAGE IS EMPTY: {}\n'.format(url))
            avg_gray_deviation = 0
        else:
            sys.stderr.write('IMAGE OK: {}\n'.format(url))
            avg_gray_deviation = current_sum / cnt

        return {
            'url': url,
            'is_gray_scale': avg_gray_deviation < cutoff,
            'avg_gray_deviation': avg_gray_deviation
        }

    else:
        sys.stderr.write('Unknown algorithm {}\n'.format(algorithm))
        return {
            'url': url,
            'errors': 'Unknown algorithm mode'
        }


def main():
    """Command-line entry point: classify every image listed in the input file.

    Parses the arguments, stores the URL field name and the cutoff in the
    module-level globals, loads the records from the ``--input`` JSON file
    and runs detect_color_image on the URL of each one.  Results containing
    an ``'errors'`` key are written to the ``--errors`` file, all others to
    the ``--output`` file; both are JSON lists.  Progress is reported on
    stderr.
    """
    global URL_FIELD_NAME, MAX_DEVIATION_CUTOFF

    options = create_argument_parser().parse_args()
    URL_FIELD_NAME = options.url_field_name
    MAX_DEVIATION_CUTOFF = options.max_deviation_cutoff

    sys.stderr.write("Algorithm version {}\n".format(options.algorithm_version))

    with open(options.input) as source:
        records = json.load(source)

    succeeded, failed = [], []
    record_count = len(records)

    for position, record in enumerate(records, 1):
        sys.stderr.write('{} / {}, '.format(position, record_count))
        image_url = json_find(URL_FIELD_NAME, record, options.deep_search)
        verdict = detect_color_image(
            image_url,
            cutoff=MAX_DEVIATION_CUTOFF,
            try_count=options.try_count,
            timeout=options.url_timeout,
            algorithm=options.algorithm_version,
        )
        bucket = failed if 'errors' in verdict else succeeded
        bucket.append(verdict)

    # Successful results first, then the failures, each into its own file.
    for destination, payload in ((options.output, succeeded), (options.errors, failed)):
        with open(destination, 'w') as sink:
            sink.write(json.dumps(payload, indent=4, ensure_ascii=False) + '\n')


# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()



