import argparse
import os
import sys
import json
import traceback
from Queue import PriorityQueue

class MetricNode:
    """A single bucket in a tree built from dot-separated metric names.

    Attributes (all assigned in ``__init__``):
      name        -- the path segment this node was created for
      path        -- the dotted path of this node ("" for a root created with "")
      children    -- dict mapping a child segment to its MetricNode
      cardinality -- number of add_node() walks that visited this node
      depth       -- depth value given at construction; children get depth + 1
    """

    def __init__(self, name, path, depth=0):
        self.depth = depth
        self.cardinality = 0
        self.children = {}
        self.path = path
        self.name = name

    def add_node(self, path_list, max_depth=-1):
        """Record one metric below this node.

        Walks ``path_list`` (the remaining path segments) downwards, creating
        missing children on the way and incrementing ``cardinality`` on every
        node visited, this node included.

        The walk ends when the segments run out, or at the first visited node
        whose depth is >= ``max_depth`` (``-1`` disables the depth limit).
        """
        current = self
        for segment in path_list:
            # A new metric is being added to this sub-section of the tree
            current.cardinality += 1
            # Depth limit reached: count the metric here but do not descend
            if max_depth != -1 and current.depth >= max_depth:
                return
            # First time this segment is seen under the current node
            if segment not in current.children:
                if current.path == "":
                    child_path = segment
                else:
                    child_path = current.path + "." + segment
                current.children[segment] = MetricNode(segment, child_path, current.depth + 1)
            current = current.children[segment]
        # The node the walk ended on is counted as well
        current.cardinality += 1


def run():
    """Command-line entry point.

    Parses the command-line options, builds the metric tree from the files in
    --ingestdir, optionally re-roots it (--shift-root), flattens it, keeps the
    --top largest buckets and then writes them to --outfile and/or prints them.

    Exits with status 1 when --ingestdir is missing or is not a directory, and
    when any exception is raised during the analysis (the traceback is printed
    first).
    """
    print("Starting analysis of ingest data")
    # Argument parsing
    parser = argparse.ArgumentParser(description='Analyze the cardinality of metric buckets found in ingest data.')
    parser.add_argument('--ingestdir', help='path to a directory of newline-separated lists of metric lines (pcap files)')
    parser.add_argument('--outfile', help='file to output analysis data to (see --format)')
    parser.add_argument('--format', help='format to write output file in (json, csv)', default='csv', type=str)
    parser.add_argument('--top', action='store', dest='top', help='prints the top N sized buckets (default: 100)', default=100, type=int)
    parser.add_argument('--max-depth', action='store', dest='max_depth', help='the maximum bucket depth to consider for output', default=-1, type=int)
    parser.add_argument('--min-depth', action='store', dest='min_depth', help='the minimum bucket depth to consider for output', default=-1, type=int)
    parser.add_argument('--shift-root', action='store', dest='shift_root', help='shift the root of the tree; use to only look at some subsection (e.g. servers or stats.counters)', default=None, type=str)
    parser.add_argument('--print', action='store', dest='stdout', help='how to print results to STDOUT (pretty, list; default: list)', default="list", type=str)
    args = parser.parse_args()
    if not args.ingestdir:
        print("--ingestdir must be specified")
        sys.exit(1)
    if not os.path.isdir(args.ingestdir):
        # Name the option that actually exists (there is no --ingestfile)
        print("--ingestdir <path> must point at a readable dir")
        sys.exit(1)
    try:
        # Create data structures representing the data, not digging beyond
        # the specified max-depth
        (ingest_tree, ingest_hash) = process_ingest_data(args.ingestdir, args.max_depth)
        # If shift_root is set, change the base of the tree before continuing
        if args.shift_root:
            ingest_tree = shift_tree_root(ingest_tree, args.shift_root)
        # Flatten the tree (dropping buckets above min_depth) and keep the
        # `count` buckets with the highest cardinality
        nodes = flatten_tree(ingest_tree, args.min_depth)
        count = min(len(nodes), args.top)
        nodes = sorted(nodes, key=lambda x: x[1], reverse=True)[:count]
        if args.outfile:
            # Any other --format value writes nothing
            if args.format == 'json':
                log_json(nodes, args.outfile, args.min_depth)
            elif args.format == 'csv':
                log_csv(nodes, args.outfile, args.min_depth)
        # Any other --print value prints nothing
        if args.stdout == "pretty":
            print_tree(ingest_tree, nodes)
        elif args.stdout == "list":
            print_list(nodes, count)
    except Exception:
        # Top-level boundary: show what went wrong, then fail the process
        traceback.print_exc()
        sys.exit(1)
    return None

def process_ingest_data(ingestdir, max_depth=-1):
    """Build the metric tree from every regular file in ``ingestdir``.

    The first whitespace-separated token of each line is taken as a
    dot-separated metric name. Each distinct name is added to the tree once;
    repeats only bump its counter in the returned dict.

    Blank lines, names that are not valid UTF-8, and directory entries that
    are not regular files are skipped.

    Returns a ``(root, seen_names)`` tuple: the root MetricNode and a dict
    mapping each metric name to the number of lines it appeared on.
    ``max_depth`` is passed through to ``MetricNode.add_node``.
    """
    root = MetricNode("<root>", "")
    seen_names = {}
    for filename in os.listdir(ingestdir):
        filepath = os.path.join(ingestdir, filename)
        # Sub-directories and the like cannot be read as metric listings
        if not os.path.isfile(filepath):
            continue
        # Read bytes so the UTF-8 check below is a real decode on both
        # Python 2 and Python 3
        with open(filepath, 'rb') as f:
            print("Processing pcap file: %s" % filename)
            for line in f:
                fields = line.split()
                # Blank / whitespace-only lines carry no metric name
                if not fields:
                    continue
                raw_metric = fields[0]
                # Skip names that are not valid utf8
                try:
                    decoded = raw_metric.decode('utf-8')
                except UnicodeDecodeError:
                    continue
                # Keep the native string type for tree keys: the byte string
                # itself where bytes is str (Python 2), the decoded text otherwise
                if isinstance(raw_metric, str):
                    metric = raw_metric
                else:
                    metric = decoded
                # Keep simple counters of the counts of each key seen. Maybe it
                # will be useful later. For now, we aren't using it.
                if metric not in seen_names:
                    root.add_node(metric.split("."), max_depth)
                    seen_names[metric] = 1
                else:
                    seen_names[metric] += 1
    return root, seen_names

def shift_tree_root(ingest_tree, shift_string):
    """Return the descendant of ``ingest_tree`` addressed by ``shift_string``.

    ``shift_string`` is a dot-separated path (e.g. ``stats.counters``); each
    segment is looked up in the ``children`` dict of the node reached so far.

    Raises KeyError, naming the offending segment, if a segment is not a
    child of the node reached so far.
    """
    node = ingest_tree
    for hop in shift_string.split("."):
        if hop not in node.children:
            # str.format needs a {} placeholder; the previous %s was never filled in
            raise KeyError('no such bucket for shifting: {}'.format(hop))
        node = node.children[hop]
    return node


def flatten_tree(root_node, min_depth=-1):
    """Flatten the tree under ``root_node`` into a list of tuples.

    Every node (leaves included) whose depth is >= ``min_depth`` contributes
    one ``(path, cardinality, depth)`` tuple; ``min_depth == -1`` keeps all
    nodes. Nodes are emitted parent-first, children in the iteration order of
    each node's ``children`` dict. Nodes above ``min_depth`` are still
    traversed so their descendants can be reached.
    """
    flattened = []
    pending = [root_node]
    while pending:
        current = pending.pop()
        too_shallow = min_depth != -1 and current.depth < min_depth
        if not too_shallow:
            flattened.append((current.path, current.cardinality, current.depth))
        # Pushed in reverse so the first child is popped, and emitted, first
        pending.extend(reversed(list(current.children.values())))
    return flattened

def print_list(nodes, count):
    """Print a header line followed by at most ``count`` entries of ``nodes``.

    ``nodes`` is a sequence of ``(path, cardinality, depth)`` tuples; each is
    printed as ``cardinality, path, depth`` in the order given (no sorting is
    done here). A ``count`` of zero or less prints only the header.
    """
    print("#METRICS, BUCKET, DEPTH")
    for node in nodes[:max(count, 0)]:
        # Function-call form of print, like the rest of the file; with a
        # single argument it prints the same text on Python 2 and Python 3
        print("%d, %s, %d" % (node[1], node[0], node[2]))

def print_tree(tree_root, target_nodes):
    """Print the part of the tree that leads to the target buckets.

    ``target_nodes`` is a sequence of ``(path, cardinality, depth)`` tuples.
    Starting at ``tree_root`` and going depth-first:

      * a node whose path is a target prints its name, its cardinality and
        its share of ``tree_root.cardinality`` as a percentage;
      * a node with an empty path prints just its name (when there is at
        least one target);
      * a node whose path is a dotted prefix of some target prints just its
        name;
      * any other node, and everything below it, is not printed.

    Each level deeper adds three spaces of indentation in front of ``|--``.
    """
    # Paths that get a full "name, cardinality (percent)" line
    target_paths = set(n[0] for n in target_nodes)
    # Every proper dotted prefix of a target ("a" and "a.b" for "a.b.c").
    # Precomputing these avoids rescanning all targets for every node.
    ancestor_paths = set()
    for target in target_paths:
        parts = target.split(".")
        for i in range(1, len(parts)):
            ancestor_paths.add(".".join(parts[:i]))
    total = tree_root.cardinality

    def handle_node(node, prefix=""):
        # Figure out what line to print
        line = None
        if node.path in target_paths:
            # An empty tree has a root cardinality of 0; report 0% rather
            # than dividing by zero
            if total:
                percent = float(node.cardinality) * 100 / total
            else:
                percent = 0.0
            line = "{:s}{:s}, {:d} (%{:.2f})".format(prefix, node.name, node.cardinality, percent)
        elif target_paths and node.path == "":
            line = node.name
        elif node.path in ancestor_paths:
            line = "{:s}{:s}".format(prefix, node.name)
        # Bail if we are on a path that has no target nodes down the tree
        if not line:
            return
        print(line)
        # Figure out how to augment the prefix, handling special cases
        if prefix == "":
            next_prefix = "   |--"
        else:
            next_prefix = "   " + prefix
        # Recurse over the children, DFS style
        for child in node.children:
            handle_node(node.children[child], next_prefix)

    handle_node(tree_root)
        

def log_json(nodes, outfile, min_depth=-1):
    """Write ``nodes`` to ``outfile`` as a JSON object of path -> cardinality.

    ``nodes`` is an iterable of ``(path, cardinality, depth)`` tuples. Only
    entries whose depth is >= ``min_depth`` are written; ``min_depth == -1``
    (the default) writes every entry, matching ``log_csv``.
    """
    # Convert the flattened nodes to a simple map to make the JSON cleaner
    node_dict = {
        node[0]: node[1]
        for node in nodes
        if min_depth == -1 or node[2] >= min_depth
    }
    # The with-block guarantees the data is flushed and the file closed
    with open(outfile, 'w') as f:
        f.write(json.dumps(node_dict))

def log_csv(nodes, outfile, min_depth=-1):
    """Write ``nodes`` to ``outfile`` as ``path,cardinality`` lines.

    ``nodes`` is an iterable of ``(path, cardinality, depth)`` tuples. Only
    entries whose depth is >= ``min_depth`` are written; ``min_depth == -1``
    (the default) writes every entry. Values are written as-is, without any
    CSV quoting.
    """
    # The with-block closes the file even if a write fails part-way through
    with open(outfile, 'w') as f:
        for node in nodes:
            if min_depth == -1 or node[2] >= min_depth:
                f.write('{},{}\n'.format(node[0], node[1]))

# Run the analysis only when executed as a script, not when imported
if __name__ == "__main__":
    run()
