#!/usr/bin/env python

import fileinput
from sets import Set
import sys
import string
import urllib
import re
import base64
from Crypto.Cipher import AES
from Crypto import Random

# MIME types that simple_type() reports as "document".
# Built with the builtin frozenset rather than the deprecated sets.Set;
# membership tests behave the same and the constant cannot be mutated.
document_mime_types = frozenset([
    "application/vnd.oasis.opendocument.text",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "text/html",
    "text/plain",
    "application/pdf",
    "application/vnd.ms-excel",
    "application/msword",
])

# MIME types that simple_type() reports as "archive".
archive_mime_types = frozenset([
    "application/zip",
    "application/x-rar",
    "application/x-gzip",
    "application/x-zip-compressed",
])

def simple_type(mime_type):
    """Collapse a full MIME type into a coarse category name.

    Returns the top-level type itself for "video", "audio" and "image";
    "document" or "archive" when the full type is listed in
    document_mime_types / archive_mime_types; otherwise "unknown".
    """
    top_level = mime_type.split("/")[0]
    # A literal tuple avoids building a new set object on every call and
    # drops the dependency on the deprecated sets.Set class.
    if top_level in ("video", "audio", "image"):
        return top_level
    if mime_type in document_mime_types:
        return "document"
    if mime_type in archive_mime_types:
        return "archive"
    return "unknown"

"""
[09/Sep/2013:02:07:49 +0400] downloader-default1g.disk.yandex.com 85.65.5.121 "GET /rdisk/a6aebdd0094bc19b5fd5ee75680379f3/522cf533/UWsel4-G9RTZbBeA2RU8dVfrBbCAXwKkT1LOAStzRrG9AFK9OvOFa7KYlJbRf-9dqx9RpLEII5Y5H3p1RnreyQ==?uid=0&filename=LargeHoopEarrings_byIN3S.7z&disposition=attachment&hash=ly16kfO1jqlmH74/6ieKJMWpqd53jjmtxwh%2BUIL0uOo%3D&limit=0&content_type=application%2Fx-7z-compressed&rtoken=a624f3763eb2126354562c348c8f6066&rtimestamp=522cf533 HTTP/1.1" 200 "https://disk.yandex.com/public/?hash=ly16kfO1jqlmH74/6ieKJMWpqd53jjmtxwh%2BUIL0uOo%3D&locale=ru" "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36" "yandexuid=98884691378149209" "-" [proxy (-) : 127.0.0.1:10010 1.323 200 ] 881 394877 1.612

[09/Sep/2013:02:07:58 +0400] downloader-default1g.disk.yandex.com.tr 185.24.80.22 "GET /rdisk/3e4854c779cc2cdd937078f9c410e917/522cf339/o11wqE_1C0ED-IBPCTS-qyUhph36t4nAXOsema50GAvPD56anxW3IrUDUXmxPHyBD96Vfj6p4haM9yfoicOi3w==?uid=189677372&filename=2013-09-07%2022-51-24.MP4&disposition=attachment&hash=&limit=0&content_type=video%2Fmp4&rtoken=22f1c7d7bd2acb785e371816ed642c6d&rtimestamp=522cf33a HTTP/1.1" 200 "-" "Yandex.Disk {\x22os\x22:\x22windows\x22,\x22vsn\x22:\x221.0.1.3812\x22,\x22id\x22:\x229EB0FDF733FA4A6A896F80C9FED0D18C\x22}" "-" "-" [proxy (-) : 127.0.0.1:10010 515.605 200 ] 551 201261110 516.006
"""

# Matches one access-log line for a successful (2xx) "GET /rdisk/..." request.
# Written as raw strings so that regex escapes such as \? and \[ are not
# treated as (invalid) Python string escapes; the compiled pattern text is
# unchanged.
#
# Capturing groups:
#   1  host               2  client ip
#   3  stid path segment (text up to the "?")
#   4  uid                5  last "&name=value" query parameter matched
#   6  filename           7  hash            8  content_type
#   9  status code       10  first space-delimited token of the user agent
#  11  size              12  duration
# Groups 6-8 are None when the corresponding query parameter is absent.
rdisk_pattern = re.compile(
    r'^[^ ]* [^ ]* ([^ ]*) ([^ ]*) '
    r'"GET /rdisk/[^/]*/[^/]*/([^?]*)\?uid=([0-9]*)'
    r'(&filename=([^& ]*)|&hash=([^& ]*)|&content_type=([^& ]*)|&[a-z_]*=[^& ]*)*'
    r' HTTP/1.1" (2[0-9][0-9]) '
    r'"[^"]*" "([^ "]*)[^"]*" "[^"]*" "[^"]*" '
    r'\[proxy [^\]]*\] [0-9]* ([0-9]*) ([0-9.]*)$'
)

# A decoded stid that already starts with "<digits>.yadisk:" is treated as
# plain text, i.e. not enciphered.
stid_pattern = re.compile(r"^[0-9]+\.yadisk:")

# 32-byte key, so AES runs with a 256-bit key. The ASCII characters of the
# literal are used as the key bytes directly (it is not hex-decoded).
# NOTE(review): the key is hard-coded in source control; consider loading it
# from configuration or the environment instead.
key = b'f8dbcb3d33954d623c42373e0f09a349'

# ECB mode has no initialisation vector. The original code passed a random
# 32-byte IV, which PyCrypto silently ignores and PyCryptodome rejects with
# a TypeError, so no IV is generated or passed here.
cipher = AES.new(key, AES.MODE_ECB)

def coalesce(a, b):
    """Return a, unless a is None or the empty string, in which case return b."""
    if a is None or a == "":
        return b
    return a

def decipher(s):
    """Decode a URL-safe base64 stid and decrypt it if it is enciphered.

    The input is base64-decoded after mapping "-" and "_" back to "+" and
    "/". If the decoded value already matches stid_pattern it is returned
    unchanged; otherwise it is decrypted with the module-level cipher and
    every "{" character is removed from the result.
    """
    # Map the URL-safe base64 alphabet back onto the standard one.
    standard_b64 = s.replace("-", "+").replace("_", "/")
    raw = base64.b64decode(standard_b64)
    if not stid_pattern.match(raw):
        # NOTE(review): "{" looks like the padding character, but replace()
        # strips every occurrence, not only trailing ones - confirm that a
        # stid can never legitimately contain "{".
        return cipher.decrypt(raw).replace("{", "")
    # Already plain text (non-enciphered stid).
    return raw

def printable(stid):
    """Return True when every character of stid is in string.printable.

    An empty stid is considered printable.
    """
    return all(ch in string.printable for ch in stid)

# Command-line mode: "-d <stid>" deciphers a single stid, prints it (or
# "non-printable" when the result contains non-printable characters) and exits.
if len(sys.argv) == 3 and sys.argv[1] == "-d":
    stid = decipher(sys.argv[2])
    sys.stdout.write((stid if printable(stid) else "non-printable") + "\n")
    sys.exit(0)

# Filter mode: read access-log lines from stdin and emit one space-separated
# record per line matched by rdisk_pattern, preceded by a header row.
sys.stdout.write("host ip stid uid filename hash content_type simple_type status_code client size duration\n")
for raw_line in fileinput.input('-'):
    line = raw_line.strip()
    m = rdisk_pattern.match(line)
    if not m:
        # Lines that rdisk_pattern does not match are skipped silently.
        continue

    try:
        stid = decipher(urllib.unquote(m.group(3)))
    except (TypeError, ValueError):
        # Malformed base64 or a ciphertext whose length the cipher rejects:
        # report the line and keep going instead of aborting the whole run.
        sys.stderr.write("undecipherable stid: [ %s  ]\n" % line)
        continue
    if not printable(stid):
        sys.stderr.write("unprintable stid: [ %s  ]\n" % line)
        continue

    # filename, hash and content_type are optional query parameters; their
    # groups are None when absent. Missing values are printed as "-" so the
    # column count stays fixed, and so unquote() is never handed None.
    filename = coalesce(m.group(6), "-")
    file_hash = coalesce(m.group(7), "-")
    content_type = coalesce(m.group(8), "-")
    client = coalesce(m.group(10), "-")

    fields = [
        m.group(1),                                  # host
        m.group(2),                                  # ip
        stid,
        m.group(4),                                  # uid
        filename,
        file_hash,
        content_type,
        simple_type(urllib.unquote(content_type)),
        m.group(9),                                  # status_code
        client,
        m.group(11),                                 # size
        m.group(12),                                 # duration
    ]
    sys.stdout.write(" ".join(fields) + "\n")
