#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
import sys
import os
import json
import codecs
import argparse
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup


def check_link(link):
    """Tell whether *link* should be kept as a video entry.

    A link is kept when it ends in ``.ogv`` or ``.webm`` and does not
    start with ``Deletion`` or ``Featured``.
    """
    # Reject the excluded prefixes first, then decide on the extension.
    if link.startswith(('Deletion', 'Featured')):
        return False
    return link.endswith(('.ogv', '.webm'))


# Maximum number of attempts to fetch one file page before skipping it.
_MAX_RETRIES = 5
# Seconds to wait on a single HTTP request; without a timeout a stalled
# connection would block the whole run indefinitely.
_REQUEST_TIMEOUT = 30
# Write a checkpoint of the results every time this many entries are added.
_CHECKPOINT_EVERY = 100


def _read_video_names(path):
    """Return the names listed in the tab-separated file at *path*.

    The second column of each line is taken as the name. Lines with fewer
    than two columns, and names rejected by ``check_link``, are skipped.
    """
    names = []
    with codecs.open(path, 'r', 'utf8') as f:
        for line in f:
            tabs = line.strip().split('\t')
            if len(tabs) >= 2 and check_link(tabs[1]):
                names.append(tabs[1])
    return names


def _fetch_file_page(name):
    """Fetch the Commons ``File:`` page for *name*.

    Returns the response, or ``None`` when the page is a 404 or when every
    one of the ``_MAX_RETRIES`` attempts raised. Responses with any other
    status code are returned as-is.
    """
    url = 'https://commons.wikimedia.org/wiki/File:{}'.format(name)
    # A bounded for-loop guarantees termination; the previous while-loop
    # never advanced its retry counter and could spin forever.
    for _ in range(_MAX_RETRIES):
        try:
            req = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except Exception as e:
            # Best effort: report the failure and try again rather than
            # aborting a long-running scrape.
            print('Exception: {}'.format(e))
            continue
        if req.status_code == 404:
            return None
        return req
    return None


def _dump_video_info(files):
    """Write *files* to ``video_info.json`` and close the file."""
    with codecs.open('video_info.json', 'w', 'utf8') as out:
        json.dump(files, out, indent=2, ensure_ascii=False, sort_keys=True)


def main():
    """Collect link and file-info text for every listed video.

    Reads names from ``allogv.txt`` and ``allwebm.txt``, fetches each
    name's Commons page, and records the href of the first anchor whose
    text is ``Original file`` or the name itself, together with the text
    of the next ``span.fileInfo`` element (``None`` when there is none).
    Results are written to ``video_info.json`` periodically and once more
    when the loop finishes.
    """
    videos = _read_video_names('allogv.txt') + _read_video_names('allwebm.txt')

    print(len(videos))

    files = []
    bad = []

    for v in tqdm(videos):
        req = _fetch_file_page(v)
        if req is None:
            continue
        soup = BeautifulSoup(req.content, 'html5lib')
        # href=True keeps only anchors that carry an href, so the lookup
        # below cannot raise KeyError.
        links = [
            a for a in soup.find_all('a', href=True)
            if a.get_text() in {'Original file', v}
        ]
        if not links:
            print('link not found for {}'.format(v))
            bad.append(v)
            continue
        print('found link for {}'.format(v))
        info = links[0].find_next('span', {'class': 'fileInfo'})
        if info:
            info = info.get_text()
        files.append(
            {
                'link': links[0]['href'],
                'info': info
            }
        )
        if len(files) % _CHECKPOINT_EVERY == 0:
            _dump_video_info(files)

    # Final write so the entries after the last checkpoint are not lost.
    _dump_video_info(files)
    print('{} videos without a link'.format(len(bad)))




# Run the scrape only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
