#!/usr/bin/env python
"""
usage: github_finder.py [-h] [--github GITHUB] [--language LANGUAGE]
                        [--gocode] [--blacklist BLACKLIST] [--smtp SMTP]
                        [--mail-subject MAIL_SUBJECT]
                        mailto query

Finds code in github

positional arguments:
  mailto                Email address to send report.
  query                 Search terms to look for

optional arguments:
  -h, --help            show this help message and exit
  --github GITHUB       Github base URL
  --language LANGUAGE   Programming language
  --gocode              Look only for go code.
  --blacklist BLACKLIST
                        Blacklisted repositories.
  --smtp SMTP           Mail server host.
  --mail-subject MAIL_SUBJECT
                        Email subject.

"""
from __future__ import print_function

import argparse
from collections import defaultdict
import os
import smtplib
from email.mime.text import MIMEText
import sys

import requests

import link_header


# Code-search endpoint used as the default value of the --github option.
DEFAULT_GITHUB_BASE = 'https://git-aws.internal.justin.tv/api/v3/search/code'
# Key under which filter_gocode() stores the matching (line number, line)
# tuples on each result dict; report() reads the same key.
FOUND_FIELD_NAME = '_GITHUB_FINDER_RESULTS'
# Value sent as the `per_page` query parameter of the search request.
GITHUB_SEARCH_LIMIT = 100


def github_session(token):
    """Build a ``requests`` session that authenticates with ``token``.

    The token is installed as the user name of the session's auth tuple,
    paired with an empty password, so every request made through the
    returned session carries it.
    """
    credentials = (token, '')
    authenticated = requests.Session()
    authenticated.auth = credentials
    return authenticated


def retrieve_page(session, url, search_args):
    """Retrieve a search results page and every page that follows it.

    Each page's items are passed through ``filter_blacklisted`` and, when
    ``search_args.gocode`` is set, through ``filter_gocode``. The next page
    is located through the ``rel="next"`` entry of the response's ``Link``
    header; retrieval stops when no such entry exists.

    Args:
        session: requests session used to issue the GET requests.
        url: URL of the first results page to fetch.
        search_args: parsed arguments; ``blacklist``, ``gocode`` and
            ``query`` are read here.

    Returns:
        A list with the surviving result dicts of all pages, in page order.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    res = []
    # Iterate instead of recursing so the call depth does not grow with the
    # number of pages.
    while True:
        req = session.get(url)
        # Unlike ``assert req.ok`` this check is not stripped by ``python -O``.
        req.raise_for_status()
        items = filter_blacklisted(req.json()['items'], search_args.blacklist)
        if search_args.gocode:
            items = filter_gocode(session, items, search_args.query)
        res.extend(items)

        # The Link header is only present on paginated responses; a result
        # set that fits on one page may come back without it.
        header = req.headers.get('link')
        next_url = None
        if header:
            for link in link_header.parse(header).links:
                if link.attr_pairs == [['rel', 'next']]:
                    next_url = link.href
                    break
        if next_url is None:
            return res
        url = next_url


def retrieve_data(session, search_args):
    """Build the search URL and start the paginated retrieval of results.

    The URL searches file contents (``in:file``) for the quoted
    ``search_args.query`` and asks for ``GITHUB_SEARCH_LIMIT`` results per
    page. When ``search_args.language`` is set, a ``language:`` qualifier is
    added.

    NOTE(review): the query is interpolated into the URL verbatim, with no
    escaping done here.

    Args:
        session: requests session used for the retrieval.
        search_args: parsed arguments; ``github``, ``query`` and ``language``
            are read here and the object is forwarded to ``retrieve_page``.

    Returns:
        The list of results produced by ``retrieve_page``.
    """
    url = '{}?per_page={}&q="{}"+in:file'.format(
        search_args.github, GITHUB_SEARCH_LIMIT, search_args.query)
    if search_args.language:
        # Qualifiers are separated by '+'; without it the URL ended in
        # "in:filelanguage:<lang>", which merges the two qualifiers.
        url += '+language:{}'.format(search_args.language)
    return retrieve_page(session, url, search_args)


def filter_blacklisted(results, blacklisted):
    """Drop every result that belongs to a blacklisted repository.

    A result is dropped when its ``['repository']['full_name']`` value is
    found in ``blacklisted``. A new list is returned; ``results`` itself is
    left untouched.
    """
    kept = []
    for entry in results:
        repo_name = entry['repository']['full_name']
        if repo_name in blacklisted:
            continue
        kept.append(entry)
    return kept


def filter_gocode(session, results, query):
    """Keep only the results for which ``find_go_code`` reports matches.

    Each kept result dict is updated in place: the matches are stored
    under the ``FOUND_FIELD_NAME`` key. Results with no matches are left
    out of the returned list.
    """
    kept = []
    for candidate in results:
        matches = find_go_code(session, candidate, query)
        if not matches:
            continue
        candidate[FOUND_FIELD_NAME] = matches
        kept.append(candidate)
    return kept


def find_go_code(session, result, query):
    """Look for occurrences of ``query`` in the actual code of a Go file.

    Results whose ``name`` does not end in ``.go`` are skipped without any
    request. Otherwise the result's ``url`` is fetched, the ``download_url``
    from that JSON response is fetched in turn, and the file text is
    scanned line by line. Lines that, once stripped, start with ``//`` or
    ``/*`` are treated as comments and ignored.

    Args:
        session: requests session used to fetch the file.
        result: search result dict; ``name`` and ``url`` are read.
        query: text that must appear in a line for it to be reported.

    Returns:
        A list of ``(line number, stripped line)`` tuples, numbered from 1.
        The list is empty for non-Go files and for files without matches.

    Raises:
        requests.HTTPError: if either request returns an error status.
    """
    # TODO: Handle multiline /* */ comments (not needed apparently)
    if not result['name'].endswith('.go'):
        # Return an empty list, not False, so the return type is always a
        # list as documented; callers only test truthiness.
        return []
    req = session.get(result['url'])
    # raise_for_status() is still enforced under ``python -O``, unlike assert.
    req.raise_for_status()
    req = session.get(req.json()['download_url'])
    req.raise_for_status()
    found = []
    for lnum, line in enumerate(req.text.split('\n'), start=1):
        line = line.strip()
        if query in line and not line.startswith(('//', '/*')):
            found.append((lnum, line))
    return found


def grouper(results):
    """Bucket search results by the repository they belong to.

    The key of each bucket is the result's ``['repository']['full_name']``
    split on ``/`` and turned into a tuple. Returns a ``defaultdict(list)``
    mapping those keys to the results, in their original order.
    """
    by_repo = defaultdict(list)
    for item in results:
        full_name = item['repository']['full_name']
        key = tuple(full_name.split('/'))
        by_repo[key].append(item)
    return by_repo


def mail(text, smtp_host, address, subject):
    """Send ``text`` as an email through ``smtp_host``.

    The message is sent from ``address`` to ``address``: the same value is
    used for the From header, the To header and the envelope.

    Args:
        text: body of the message.
        smtp_host: host of the SMTP server to connect to.
        address: sender and recipient address.
        subject: value of the Subject header.

    Returns:
        Whatever ``smtplib.SMTP.quit()`` returns.
    """
    msg = MIMEText(text)
    msg['Subject'] = subject
    msg['From'] = address
    msg['To'] = address
    smtp_obj = smtplib.SMTP(smtp_host)
    try:
        smtp_obj.sendmail(address, [address], msg.as_string())
        return smtp_obj.quit()
    finally:
        # Release the socket even when sendmail()/quit() raise; close() is
        # harmless after a successful quit().
        smtp_obj.close()


def report(repos):
    """Build the plain-text report for grouped search results.

    Nothing is printed here; the report is returned as a string.

    Args:
        repos: mapping of ``(organization, repository)`` tuples to the list
            of result dicts found in that repository. Each result must have
            a ``path``; the matched lines stored under ``FOUND_FIELD_NAME``
            are optional.

    Returns:
        The report text. Repositories are listed in sorted order, with an
        organization banner whenever the organization changes, followed by a
        summary line. If there are no results at all, the text only says
        that nothing was found.
    """
    num_items = 0
    prev_org = None
    r = []
    for org, repo in sorted(repos):
        results = repos[(org, repo)]
        if org != prev_org:
            r.extend(['###', '###', '### ORGANIZATION: ' + org, '###', '###'])
        r.extend(['', '  #', '  # Repository: ' + org + '/' + repo, '  #'])
        num_items += len(results)
        for result in results:
            r.append('    Path: ' + result['path'])
            # Matched lines are only attached by filter_gocode(); without
            # --gocode the key is absent, so fall back to listing no lines
            # instead of failing with KeyError.
            for lnum, line in result.get(FOUND_FIELD_NAME, ()):
                r.append('      line {}: {}'.format(lnum, line))
            r.append('')
        prev_org = org
    if num_items:
        r.extend([
            '',
            '# Found {} results in {} repositories!'.format(
                num_items, len(repos)),
            ''
        ])
    else:
        r = ['', '# Found no results!', '']
    return '\n'.join(r)


def get_args():
    """Parse the command line and return the resulting namespace.

    Positional arguments are ``mailto`` and ``query``; the options are
    ``--github``, ``--language``, ``--gocode``, ``--blacklist`` (may be
    given several times), ``--smtp`` and ``--mail-subject``.
    """
    parser = argparse.ArgumentParser(description='Finds code in github')
    # (name or flag, add_argument keywords), in the order shown by --help.
    arguments = (
        ('mailto', {'help': 'Email address to send report.'}),
        ('query', {'help': 'Search terms to look for'}),
        ('--github', {
            'action': 'store',
            'default': DEFAULT_GITHUB_BASE,
            'help': 'Github base URL',
        }),
        ('--language', {
            'action': 'store',
            'help': 'Programming language',
        }),
        ('--gocode', {
            'action': 'store_true',
            'help': 'Look only for go code.',
        }),
        ('--blacklist', {
            'action': 'append',
            'default': [],
            'help': 'Blacklisted repositories.',
        }),
        ('--smtp', {
            'action': 'store',
            'default': '127.0.0.1',
            'help': 'Mail server host.',
        }),
        ('--mail-subject', {
            'action': 'store',
            'default': 'Github Finder report.',
            'help': 'Email subject.',
        }),
    )
    for name, options in arguments:
        parser.add_argument(name, **options)
    return parser.parse_args()


def main():
    """Run the search, print the report and mail it.

    The API token is read from the ``GITHUB_AUTH`` environment variable.
    Returns ``True`` when at least one result was found, ``False``
    otherwise.
    """
    args = get_args()
    token = os.environ['GITHUB_AUTH']
    session = github_session(token)
    results = retrieve_data(session, args)
    report_text = report(grouper(results))
    print(report_text)
    mail(report_text, args.smtp, args.mailto, args.mail_subject)
    return bool(results)


# main() returns a bool that sys.exit() uses as the exit status, so the
# script exits with status 1 when results were found and 0 when none were.
if __name__ == "__main__":
    sys.exit(main())
