#!/usr/bin/env python

"""
Static files server with directory listing.

Features:
 - listening both ipv4 and ipv6
 - HEAD method
 - directory listing
 - uses sendfile to send files with zero-copy overhead
 - tail -f mode of sending files
 - collect statistics
 - smart and browser friendly mime-type detection
 - debugging by SIGHUP

Dependencies:
 - tornado 2.4+
 - pysendfile
 - futures
 - python-magic
"""

from __future__ import absolute_import

import re
import os
import sys
import stat
import time
import json
import shlex
import errno
import signal
import urllib
import httplib
import logging
import tarfile
import datetime
import operator as op
import mimetypes
import functools as ft
import itertools as it
import collections
import email.utils

import subprocess32 as sp

import tornado
import tornado.web
import tornado.ioloop
import tornado.escape
import tornado.httputil
import tornado.template
import tornado.websocket
import tornado.httpserver

from kernel.util import console

try:
    import libarchive
except ImportError:
    libarchive = None

import sendfile
from cachetools import LRUCache
import concurrent.futures

from sandbox.fileserver import butterfly  # noqa

from sandbox import common
import sandbox.common.types.task as ctt
import sandbox.common.types.misc as ctm
import sandbox.common.types.user as ctu
import sandbox.common.types.client as ctc
import sandbox.common.types.resource as ctr

import sandbox.agentr.client as aclient


# Module-wide logger; everything in this file logs through the "fileserver" channel.
logger = logging.getLogger("fileserver")
# Settings object read by the handlers below (`settings.client...`, `settings.server...`,
# `settings.this...`). It is None at import time, so it is presumably assigned during
# server start-up outside this view — TODO confirm.
settings = None


def private(method):
    """
    Decorator for handler methods which may be called from the local host only.

    The wrapped method runs when `self.request.remote_ip` equals "127.0.0.1";
    any other remote address gets `HTTPError(FORBIDDEN)`.
    """
    def wrapper(self, *args, **kws):
        if self.request.remote_ip == "127.0.0.1":
            return method(self, *args, **kws)
        raise tornado.web.HTTPError(httplib.FORBIDDEN)

    return ft.wraps(method)(wrapper)


# Running total of bytes written to clients by this process. The handlers update it
# through `global BYTES_SENT` and `render_from_string` exposes it to the templates.
BYTES_SENT = 0  # for statistics

# Maps a request handler to the set of watcher objects registered for it via
# `register_watcher`; `notify` pops the entry and calls `close()` on every watcher.
processing_requests = collections.defaultdict(set)


def register_watcher(handler, delegate):
    """Attach `delegate` to `handler`; a later `notify(handler)` will call `delegate.close()`."""
    watchers = processing_requests[handler]
    watchers.add(delegate)


def notify(handler):
    """
    Close every watcher registered for `handler` and forget the registration.

    Does nothing when the handler has no registered watchers. A failure of one
    watcher's `close()` is logged and does not prevent closing the remaining ones.
    """
    watchers = processing_requests.pop(handler, None)
    if watchers is not None:
        for watcher in watchers:
            try:
                watcher.close()
            except Exception as ex:
                logger.error("Exception in notify: %s", ex)


class BarrierError(ValueError):
    """Raised on an attempt to enter a `Barrier` which has already been closed."""


class Barrier(object):
    """
    Context manager guarding access to an object which may expire.

    Entering the barrier yields the wrapped object until `close()` is called;
    afterwards entering raises `BarrierError` with the message given on creation
    (or "Object expired" when no message was given).
    """

    def __init__(self, obj, message=None):
        self.__message = message
        self.__closed = False
        self.__obj = obj

    def __enter__(self):
        if not self.__closed:
            return self.__obj
        raise BarrierError(self.__message or "Object expired")

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Nothing to release; exceptions raised inside the `with` body are propagated."""
        return None

    def close(self):
        """Mark the barrier as closed, so every further attempt to enter it fails."""
        self.__closed = True


# HTML page implementing the browser side of the `tail -f` view.
# Template variables: `path` (shown in the title) and `url` (WebSocket endpoint).
# The script opens a WebSocket to `url`, sends `{size: <last known size>}` every
# 2 seconds and, for every JSON message received, appends its `data` field to the
# <pre id="tail"> element and remembers its `size` field for the next poll.
TAIL_PAGE_HTML = """
<!DOCTYPE html>
<html>
    <head>
        <title>tail -f {{ path }}</title>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <script src="//yastatic.net/jquery/1.8.3/jquery.min.js"></script>
    </head>

    <body>
        <pre id="tail"></pre>
        <script>
            var Application = function() {
                var g_size = null;

                var socket = new WebSocket('{{ url }}');

                setInterval( function() {
                    socket.send(JSON.stringify({size: g_size}));
                }, 2000);

                socket.onmessage = function ( event ) {
                    obj = jQuery.parseJSON(event.data)

                    $('#tail').text( $('#tail').text() + obj.data );
                    $(window).scrollTop($('#tail').height());

                    g_size = obj.size;
                }

                window.onbeforeunload = function() {
                    socket.onclose = function () {};
                    socket.close();
                }

                return {
                    socket : socket
                };
            };

            $(function() { Application(); });
        </script>
    </body>
</html>
"""


# Tornado template for the HTML directory (or archive) listing.
# Variables used by the template:
#   host         - host name shown in the title and in the page header
#   root         - URL path of the listed directory; rendered as breadcrumb links
#   total_size   - sum of the listed sizes, formatted with `size2str`
#   task         - object with `.url` and `.id`, or a false value when unknown
#   resource     - object with `.url` and `.id`, or a false value when unknown
#   files        - entries with `.name`, `.type`, `.size` and `.mtime`
#   tailable     - whether the "tail" link is offered for regular files
#   tz_offset    - value subtracted from `mtime` before it is displayed
#   size2str     - callable formatting a byte count
#   bytes_sent   - server-wide byte counter, emitted in a trailing HTML comment
# The task link is rendered only when `task` is set: a listing may carry a resource
# without a task, and dereferencing `task.url` in that case fails the whole render.
TMPL = """{% import urllib %}\
<!DOCTYPE html>
<html>
  <head>
    <title>Sandbox resource viewer at {{ host }}.</title>
    <style>
      body {
         font-family: monospace;
         font-size: 0.95em;
      }
      a {
        text-decoration: none;
      }
      a:hover {
        text-decoration: underline;
      }
      td {
        padding: 0px 5px 0px 5px;
        border-right: 1px dotted #aaa;
        white-space: nowrap;
      }
      .title {
         # white-space: nowrap;
      }
      .head-link {
         color: #aaa;
      }
      .file-row {
        float: left;
        width: 655px;
      }
    </style>
  </head>
  <body>
    <h3 class="title">Sandbox files at {{ host }}:
      <span class="head-link">
      {% for i, p in enumerate(root.split('/')) %}
        {% if not p %}
            {% continue %}
        {% end %}
        {% set short_name = p[:200] + '...' if len(p) > 203 else p %}
        <a href="{{ urllib.quote('/'.join(root.split('/')[:i + 1])) }}/">{{ short_name }}</a>/
      {% end %}
      &mdash; Total size: {{ size2str(total_size) }}</span></h3>
      {% if task or resource %}
        <h4>
        Source Sandbox
        {% if resource %}
        <a href="{{ resource.url }}" target='_blank'>resource #{{ resource.id }}</a>{% if task %} of{% end %}
        {% end %}
        {% if task %}
        <a href="{{ task.url }}" target='_blank'>task #{{ task.id }}</a>
        {% end %}
        </h4>
      {% end %}
      <table cellspacing="1" cellpadding="0">
      {% for i, f in enumerate(files) %}
            <tr{% if i % 2 == 0 %} style="background-color: #EEC"{% end %}>
            <td>
              <a title="{{ f.name }}" href="{{ urllib.quote(root + f.name) }}">
                {% set short_name = f.name[:200] + '...' if len(f.name) > 203 else f.name %}
                {% if f.type == "DIRECTORY" %}<b>{% end %}
                {% if f.type == "LINK" %}<i>@{% end %}{{ short_name }}{% if f.type == "LINK" %}</i>{% end %}
                {% if f.type == "DIRECTORY" %}</b>{% end %}
              </a>
            </td>
            <td>{% if f.size is not None %}{{ size2str(f.size) }}{% else %}&mdash;{% end %}</td>
            <td>{% if f.mtime %}{{ f.mtime - tz_offset }}{% else %}&mdash;{% end %}</td>
            <td>{% if f.type == "REGULAR" %}
              <a title="Show file as plain text" href="{{ urllib.quote(root + f.name) }}?force_text_mode=1">as text</a>
                {% else %}
              &mdash;
                {% end %}
            </td>
            <td>{% if f.type != "DIRECTORY" %}
              <a title="Download as binary" href="{{ urllib.quote(root + f.name) }}?force_binary_mode=1">download</a>
                {% else %}
                  {% if f.name != ".." %}
                    <a title="Download as tarball" href="{{ urllib.quote(root + f.name) }}?stream=tgz">download</a>
                  {% else %}
                    &mdash;
                  {% end %}
                {% end %}
            </td>
            <td>{% if tailable and f.type == "REGULAR" %}
                  <a title="Analog of UNIX tail -f command" style="color: forestgreen"
                  href="{{ urllib.quote(root + f.name) }}?tail=1&force_text_mode=1">tail</a>
                {% elif f.type == "ARCHIVE" %}
                  <a title="Browse an archive" style="color: forestgreen"
                  href="{{ urllib.quote(root + f.name) }}/">browse</a>
                {% else %}
                  &mdash;
                {% end %}
            </td>
            </tr>
      {% end %}
      </table>
      <!-- Total bytes sent by a server: {{ size2str(bytes_sent) }} -->
"""


class File(object):
    """
    A source of data to be sent to a client.

    Attributes:
        handle: file-like object the content is read from.
        name:   name of the file, used for mime type detection and headers.
        fd:     OS-level descriptor of `handle`, or None when there is none.
    """
    __slots__ = ("handle", "name", "fd")

    def __init__(self, handle, name, fd=None):
        self.fd = fd
        self.name = name
        self.handle = handle

    def open(self):
        """Nothing to do here: the handle is supplied already opened."""
        return None

    def close(self):
        """Close the underlying handle."""
        handle = self.handle
        handle.close()


class ArchiveFile(File):
    """
    A single entry of an archive opened with `libarchive`.

    The data handle is not available until `open()` is called; closing the file
    closes the whole archive object it has been taken from.
    """
    __slots__ = ("ar", "size") + File.__slots__

    def __init__(self, ar, entry):
        entry_name = os.path.basename(entry.pathname)
        super(ArchiveFile, self).__init__(None, entry_name)
        self.size = entry.size
        self.ar = ar

    def open(self):
        """Obtain a read stream of `size` bytes from the archive and keep it as the handle."""
        stream = self.ar.readstream(self.size)
        self.handle = stream

    def close(self):
        """Close the archive the entry belongs to."""
        archive = self.ar
        archive.close()


class Tarfile(File):
    """
    A single member of an archive opened with the standard `tarfile` module.

    The member is extracted to a file-like handle on construction; closing the
    file closes the whole tar object it has been taken from.
    """
    __slots__ = ("tar", "size") + File.__slots__

    def __init__(self, tar, tarinfo):
        member_handle = tar.extractfile(tarinfo)
        member_name = os.path.basename(tarinfo.name)
        super(Tarfile, self).__init__(member_handle, member_name)
        self.size = tarinfo.size
        self.tar = tar

    def close(self):
        """Close the tar archive the member belongs to."""
        archive = self.tar
        archive.close()


def get_mime(file_):
    """
    Guess the mime type of a file.

    :param file_: either a `File` object (its content is sniffed via `file_.handle`)
                  or a string with a path to a file on disk.
    :return: mime type string or None if it cannot be detected. Every "text/*" type gets
             "; charset=UTF-8" appended, and "text/x-*" types are reported as "text/plain"
             so that browsers show them inline.

    Files named "*.log", "*.txt" and "*.out" are always reported as plain text.
    For `File` objects python-magic is tried first with a fallback to the name extension;
    for paths the extension is tried first with a fallback to python-magic (if installed).
    """
    plain_text = 'text/plain; charset=UTF-8'
    text_suffixes = ('.log', '.txt', '.out')

    if isinstance(file_, File):
        if file_.name.endswith(text_suffixes):
            return plain_text
        try:
            import magic
            mime_type = magic.from_buffer(file_.handle.read(0x3FFF), mime=True)
            file_.handle.seek(0)
        except Exception:
            # python-magic is unavailable or the handle can't be read/rewound
            # (e.g. an archive entry) - guess mime type by file extension.
            mime_type, _ = mimetypes.guess_type(file_.name)
    else:
        if file_.endswith(text_suffixes):
            return plain_text
        # guess by file extension
        mime_type, _ = mimetypes.guess_type(file_)
        # guess by file contents
        if not mime_type:
            try:
                import magic
                mime_type = magic.from_file(file_, mime=True)
            except ImportError:
                pass

    # convert text/x-* to text/plain
    if mime_type and mime_type.startswith('text/'):
        mime_type = ('text/plain' if mime_type.startswith('text/x-') else mime_type) + '; charset=UTF-8'
    return mime_type


@common.utils.singleton
def get_pool():
    """Create the thread pool (50 workers) which request processing is submitted to."""
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=50)
    return pool


@common.utils.singleton
def get_loop():
    """Return the tornado IOLoop instance that handler callbacks are scheduled on."""
    loop = tornado.ioloop.IOLoop.instance()
    return loop


class FileServerHandler(tornado.web.RequestHandler):
    """
    Serve static files and show directory listing.

    Based on builtin tornado.web.StaticFileHandler but
    uses sendfile instead of read/write to send a file.
    Also support directory listing.
    Works with broken symlinks and unicode file names.
    Supports tail -f mode.
    """
    # Number of requests currently being processed by all handler instances:
    # incremented in `prepare` and decremented in `finish`.
    requests_in_progress = 0

    # Value (in seconds) returned by `get_cache_time` for requests carrying a "v" argument.
    CACHE_MAX_AGE = 86400 * 365 * 10  # 10 years

    class TarballChunkedStream(object):
        """
        File-like sink for `tarfile`, which forwards everything written to it
        to the client socket framed as HTTP chunked transfer encoding.
        """
        CRLF = "\r\n"
        # Supported stream kinds: `tarfile` compression mode, file name suffix and content type.
        STREAM_OPTIONS = {
            "tgz": {"mode": "gz", "suffix": "tar.gz", "content": "application/x-gzip"},
            "tar": {"mode": "", "suffix": "tar", "content": "application/x-gtar"},
        }

        def __init__(self, handler, mode):
            self.handler = handler
            self.mode = mode
            self.counter = 0  # payload bytes sent so far
            self.socket = handler.request.connection.stream.socket

        def async_stream(self, path, basename):
            """
            Pack `path` into a tar stream (stored under `basename`) and send it to the client.

            Errors are logged and stop any further writing. In any case the amount of
            bytes sent is added to the global counter and the handler's `finish` is
            scheduled on the IOLoop.
            """
            global BYTES_SENT

            def make_writable(tarinfo):
                # Every archived entry gets write permission for user, group and others.
                tarinfo.mode |= stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH
                return tarinfo

            try:
                with tarfile.open(mode="w|" + self.mode, fileobj=self, bufsize=0x3FFFFF) as tar:
                    tar.add(path, arcname=basename, filter=make_writable)
                # Zero-length chunk terminates the chunked stream.
                self.write("")
            except Exception as ex:
                # Dropping the socket turns all subsequent `write` calls into no-ops.
                self.socket = None
                logger.error(
                    "Error sending tarball stream of %r: %s (sent so far %d bytes)",
                    path, str(ex), self.counter
                )
            finally:
                BYTES_SENT += self.counter
                get_loop().add_callback(self.handler.finish)

        def write(self, data):
            """Send `data` as a single HTTP chunk; the socket is switched to blocking mode meanwhile."""
            sock = self.socket
            if not sock:
                return
            try:
                sock.setblocking(True)
                size_line = hex(len(data))[2:]
                sock.sendall("".join((size_line, self.CRLF, data, self.CRLF)))
                self.counter += len(data)
            finally:
                sock.setblocking(False)

    class FileInfo(common.patterns.Abstract):
        # Record describing one entry of a directory or archive listing.
        # Instances are created positionally in the order of `__slots__`.

        class Type(common.utils.Enum):
            # Kind of the listed entry. Members are declared as None; their actual values
            # are presumably assigned by `common.utils.Enum` (the listing template compares
            # them with strings like "DIRECTORY") — TODO confirm.
            REGULAR = None
            ARCHIVE = None
            UNKNOWN = None
            DIRECTORY = None
            LINK = None

        # name - entry name; type - one of `Type`; size - size in bytes or None;
        # ctime / mtime - datetime objects or None; target - link target information.
        __slots__ = ("name", "type", "size", "ctime", "mtime", "target")
        # One None per slot; presumably the per-field defaults used by
        # `common.patterns.Abstract` — TODO confirm.
        __defs__ = (None,) * 6

    # State of archive browsing: `point` is the path inside the archive being listed
    # ("" for the archive root), `files` is the list of `FileInfo` collected for it.
    Browse = collections.namedtuple("Browse", ("point", "files"))

    def initialize(self, path):
        """Remember the absolute form of `path`, with a trailing path separator, as `self.root`."""
        absolute = os.path.abspath(path)
        self.root = absolute + os.path.sep

    def get_cache_time(self, path, modified, mime_type):
        """Tell for how long (in seconds) a response may be cached.

        Override to customize cache control behavior. A positive result triggers
        aggressive caching; 0 marks the resource as merely cacheable.

        This implementation ignores its arguments: requests carrying a "v" argument
        get `CACHE_MAX_AGE` (10 years), all the others get 0.
        """
        if "v" in self.request.arguments:
            return self.CACHE_MAX_AGE
        return 0

    def parse_url_path(self, url_path):
        """Translate a URL path into a path in terms of the local filesystem.

        ``url_path`` is the path component of the URL; the result is the same
        path with "/" replaced by the OS path separator (i.e. it is returned
        untouched on systems where the separator is "/").
        """
        if os.path.sep == "/":
            return url_path
        return url_path.replace("/", os.path.sep)

    def render_from_string(self, tmpl, **kwargs):
        """
        Render the template given as the string `tmpl`.

        The template namespace consists of the handler's default namespace, `kwargs`
        and two extra names which always take precedence: `bytes_sent` (global counter
        of bytes sent) and `size2str` (size formatting function).
        """
        namespace = self.get_template_namespace()
        namespace.update(kwargs)
        namespace["bytes_sent"] = BYTES_SENT
        namespace["size2str"] = common.utils.size2str
        template = tornado.template.Template(tmpl)
        return template.generate(**namespace)

    def _cors_headers(self):
        """Allow cross-origin access from anywhere, echoing back the headers the client asked for."""
        self.set_header('Access-Control-Allow-Origin', '*')
        requested = self.request.headers.get('Access-Control-Request-Headers')
        if not requested:
            return
        self.set_header('Access-Control-Allow-Headers', requested)

    def _handle_range(self, content_size):
        """
        Process the `Range` request header for a content of `content_size` bytes.
        The logic follows `tornado.web.StaticFileHandler.get` - see the comments there.

        Sets status and headers of the response as a side effect: 416 for a range which
        cannot be satisfied, 206 with `Content-Range` for a partial response.

        :return (offset, nbytes, is_finished) tuple; `is_finished` is True if the range
                cannot be satisfied and there is nothing to send (offset and nbytes are None then).
        """
        range_header = self.request.headers.get("Range")
        parsed = tornado.httputil._parse_request_range(range_header) if range_header else None  # noqa
        if not parsed:
            # No (valid) range requested - send the whole content.
            return 0, content_size, False

        start, end = parsed
        unsatisfiable = (start is not None and start >= content_size) or end == 0
        if unsatisfiable:
            self.set_status(httplib.REQUESTED_RANGE_NOT_SATISFIABLE)
            self.set_header("Content-Type", "text/plain")
            self.set_header("Content-Range", "bytes */{}".format(content_size))
            return None, None, True

        if start is not None and start < 0:
            # Negative start counts from the end of the content.
            start = max(start + content_size, 0)
        if end is not None and end > content_size:
            end = content_size

        first = start or 0
        last = end or content_size
        if last - first != content_size:
            # Report partial content only if less than the whole content is requested.
            self.set_status(httplib.PARTIAL_CONTENT)
            self.set_header("Content-Range", tornado.httputil._get_content_range(start, end, content_size))  # noqa
        return first, last - first, False

    def _send_file_chunk(self, file_, fd_out_barrier, tail, offset=0, nbytes=None):
        """
        Send the content of `file_` to the client chunk by chunk until there is nothing left.

        Uses `sendfile` when `file_.fd` is set, otherwise reads from `file_.handle` and
        writes to the output descriptor with `os.write`. On every exit path the handler's
        `finish` is scheduled on the IOLoop, and `file_` is closed after the loop.
        XXX: exceptions from this function will be hidden

        :param file_:           object with `open()`, `close()`, `fd` and `handle` (see `File`).
        :param fd_out_barrier:  context manager yielding the output file descriptor; a
                                `BarrierError` raised on entering it stops the transfer.
        :param tail:            if true and `file_.fd` is set, keep polling the file for new
                                data instead of finishing when nothing has been sent.
        :param offset:          position in the source file to start `sendfile` from.
        :param nbytes:          amount of bytes to send, `None` means no limit.
        """
        chunk_max_size = 0x1FFF
        # Data read from `file_.handle` but not written to the socket completely yet.
        chunk = None
        file_.open()
        global BYTES_SENT

        while True:
            try:
                # The descriptor is re-acquired on every iteration to notice a closed barrier.
                with fd_out_barrier as fd_out:
                    chunk_size = chunk_max_size if nbytes is None else min(chunk_max_size, nbytes)
                    if file_.fd:
                        sent = sendfile.sendfile(fd_out, file_.fd, offset, chunk_size)
                    else:
                        # Read the next portion only if the previous one has been written completely.
                        if not chunk:
                            chunk = file_.handle.read(chunk_size) or ""
                        sent = os.write(fd_out, chunk)
                        # Keep the unsent remainder for the next iteration.
                        chunk = chunk[sent:] if sent < len(chunk) else None

                    if not sent:
                        if tail and file_.fd:
                            # Tail mode: wait for new data to appear in the file.
                            time.sleep(0.1)
                        else:
                            # Nothing left to send - finish the request from the IOLoop.
                            get_loop().add_callback(self.finish)
                            break
                    else:
                        offset += sent
                        BYTES_SENT += sent
                        # Tornado 4.x has its own byte counter
                        self.request.connection._expected_content_remaining -= sent
                        if nbytes is not None:
                            nbytes -= sent
            except OSError as e:
                if e.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                    # The socket is not ready for writing - retry a bit later.
                    time.sleep(0.03)
                else:
                    logger.error("Exception in sendfile_chunk: %s\n" % e)
                    logger.exception(e)
                    get_loop().add_callback(self.finish)
                    break
            except BarrierError:
                logger.info("Client disconnected. Stop transfer.")
                get_loop().add_callback(self.finish)
                break
            except Exception as e:
                logger.error("Exception in sendfile_chunk: %s\n" % e)
                logger.exception(e)
                get_loop().add_callback(self.finish)
                break

        file_.close()

    def _send_file(self, include_body, path, abspath, archive=None):
        """
        Send file to the client socket using sendfile.

        Firstly, prepare and send HTTP headers, then send the file content by chunks
        (see `_send_file_chunk`). If `force_binary_mode` is set in the query string the
        file is offered as an attachment; if `force_text_mode` is set, `Content-Type`
        is forced to plain text. Unless `?tail=1` is requested, `Range` and
        `If-Modified-Since` request headers are honored; an `If-Modified-Since` value
        which cannot be parsed as a date is ignored.

        Args:
            - include_body: if False then assume that HEAD method was used
                            and set only the headers instead of
                            sending the whole file.
            - path:         path to file as in URL
            - abspath:      physical location of the file (of the archive, if `archive` is given)
            - archive:      `File`-like object (`ArchiveFile` or `Tarfile`) to send file from.

        Raises:
            tornado.web.HTTPError(FORBIDDEN) if the file cannot be opened.
        """
        # Tail mode is not available for archive entries.
        tail = self.request.arguments.get("tail") if not archive else False

        if tail:
            logger.info('Using TAIL for "{0}"'.format(path))

        fname = os.path.basename(abspath) if not archive else archive.name
        try:
            fname = fname.encode("utf-8")
        except UnicodeDecodeError:
            pass
        self.set_header(
            'Content-Disposition',
            '{}; filename="{}"'.format(
                'attachment' if self.request.arguments.get("force_binary_mode") else 'inline',
                fname
            )
        )
        if self.request.arguments.get("force_text_mode"):
            mime_type = 'text/plain; charset=UTF-8'
        else:
            mime_type = get_mime(abspath if not archive else archive) or 'text/plain; charset=UTF-8'

        if mime_type:
            self.set_header("Content-Type", mime_type)
        self._cors_headers()

        offset, nbytes = 0, None
        if not tail:
            stat_result = os.stat(abspath)
            file_size = archive.size if archive else stat_result[stat.ST_SIZE]

            offset, nbytes, is_finished = self._handle_range(file_size)
            if is_finished:
                # The requested range cannot be satisfied, status and headers are set already.
                self.finish()
                return

            modified = datetime.datetime.utcfromtimestamp(stat_result[stat.ST_MTIME])
            if not nbytes:
                self.set_header("Content-Type", "text/plain")  # Force plain text for empty files.
            self.set_header("Content-Length", nbytes)
            self.set_header("Connection", "close")
            self.set_header("Last-Modified", modified)

            cache_time = self.get_cache_time(path, modified, mime_type)

            if cache_time > 0:
                self.set_header(
                    "Expires",
                    datetime.datetime.utcnow() + datetime.timedelta(seconds=cache_time)
                )
                self.set_header("Cache-Control", "max-age=" + str(cache_time))
            else:
                self.set_header("Cache-Control", "public")

            # Check the If-Modified-Since, and don't send the result if the
            # content has not been modified
            ims_value = self.request.headers.get("If-Modified-Since")
            if ims_value is not None:
                date_tuple = email.utils.parsedate_tz(ims_value)
                # `parsedate_tz` returns None for a malformed date - ignore such a header
                # instead of failing the whole request inside `mktime_tz`.
                if date_tuple is not None:
                    if_since = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    if if_since >= modified:
                        self.set_status(304)
                        self.finish()
                        return

        if not include_body:
            self.finish()
            return

        if not archive:
            try:
                archive = File(open(abspath), fname)
                archive.fd = archive.handle.fileno()
            except IOError as ex:
                raise tornado.web.HTTPError(httplib.FORBIDDEN, str(ex))

        self.flush()  # send HTTP headers to socket

        # send body from a thread
        if tail:
            archive.handle.seek(sum(map(len, common.fs.tail(archive.handle, 10))), os.SEEK_END)

        # The barrier gets closed by `notify` when the request is finished,
        # which makes the sending loop stop.
        b = Barrier(self.request.connection.stream.socket.fileno(), "Client disconnected")
        register_watcher(self, b)

        self._send_file_chunk(archive, b, tail, offset, nbytes)

    def compute_etag(self):
        """ Always return None: `Etag` calculation is very CPU-intensive, so it is switched off. """
        return None

    def finish(self, *args, **kwargs):
        """
        Finish the request: close the watchers registered for this handler and
        decrease the counter of requests in progress before delegating to the base class.

        `RequestHandler.finish` is overridden directly, because `RequestHandler.on_finish`
        may not be called in case of error and `RequestHandler.on_connection_close`
        is rather unpredictable.
        """
        notify(self)
        handler_class = FileServerHandler
        handler_class.requests_in_progress -= 1
        return super(handler_class, self).finish(*args, **kwargs)

    def _get_file_type(self, path):
        """
        Classify the filesystem entry at `path`.

        :return: `FileInfo.Type.LINK` for a symbolic link (checked first, so links are never
                 followed), `DIRECTORY` for a directory, `ARCHIVE` for a regular file whose
                 name has one of the known tarball suffixes, `REGULAR` for any other regular
                 file and `UNKNOWN` for everything else (including missing paths).
        """
        archive_suffixes = (".tar", ".tgz", ".tar.gz", ".tbz", ".tbz2", ".tar.bz2", ".tar.zstd")
        types = self.FileInfo.Type

        if os.path.islink(path):
            return types.LINK
        if os.path.isdir(path):
            return types.DIRECTORY
        if os.path.isfile(path):
            # `endswith` accepts a tuple of suffixes, no need to map over them.
            return types.ARCHIVE if path.endswith(archive_suffixes) else types.REGULAR
        return types.UNKNOWN

    def _build_json_response(self, files, root):
        """
        Serialize a files list to JSON.

        :param files: iterable of `FileInfo` objects.
        :param root:  path the files are located at; used to build each file's path and URL.
        :return: UTF-8 encoded JSON object keyed by file name, each value holding
                 `type`, `path`, `url`, `size` and `time` (`created` / `modified`),
                 plus `target` for links.
        """
        listing = {}

        for info in files:
            full_path = os.path.join(root, info.name)
            entry = {
                "type": info.type,
                "path": full_path,
                "url": "{}://{}{}".format(
                    self.request.protocol, self.request.host, urllib.quote(full_path.replace("\\", "/"))
                ),
                "size": info.size,
                "time": {
                    "created": info.ctime.isoformat() + "Z",
                    "modified": info.mtime.isoformat() + "Z",
                }
            }
            if info.type == self.FileInfo.Type.LINK:
                entry["target"] = info.target
            listing[info.name] = entry

        payload = json.dumps(listing, ensure_ascii=False)
        return payload.encode("utf8")

    def _list_builder(self, include_body, root, files, tailable=True):
        """
        Build HTML or JSON presentation for given files list and finish the request with it.

        :param include_body: if False, only the headers are sent (HEAD request).
        :param root:         URL path of the listed location; replaced by the value of the
                             forwarded-path request header when that header is present.
        :param files:        list of `FileInfo`; NOTE: it is modified in place for the HTML
                             output (a ".." entry is inserted, names are encoded).
        :param tailable:     whether to offer "tail" links in the HTML output.

        JSON is produced when the `Accept` request header mentions "application/json",
        HTML otherwise.
        """

        self.set_header("Connection", "close")
        self.set_header("X-Files-List", "true")
        if not include_body:
            return self.finish()

        entity_t = collections.namedtuple("Entity", ("id", "url"))
        task, resource = None, None
        if ctm.HTTPHeader.FORWARDED_PATH in self.request.headers:
            # The request came with an explicit original path - show it instead of the local one.
            root = self.request.headers[ctm.HTTPHeader.FORWARDED_PATH]
            path = root.strip('/').split('/')
        else:
            # prepend and append slashes for correct urls rendering
            path = root.strip('/').split('/')
            if len(path) > 2:
                # The third path component is taken as the task identifier.
                tid = ''.join(path[2])
                task = entity_t(tid, common.utils.get_task_link(tid, settings))

        if not root.startswith('/'):
            root = '/' + root
        if not root.endswith('/'):
            root += '/'

        # Base URL for task/resource links built from the request headers below.
        root_url = (
            "http://{}:{}".format(settings.server.web.address.host, settings.server.api.port)
            if settings.server.web.address.show_port else
            "https://" + settings.server.web.address.host
        )
        # Identifiers passed in the request headers take precedence over the path-derived task.
        if ctm.HTTPHeader.TASK_ID in self.request.headers:
            tid = self.request.headers[ctm.HTTPHeader.TASK_ID]
            task = entity_t(tid, "/".join([root_url, "task", tid, "view"]))
        if ctm.HTTPHeader.RESOURCE_ID in self.request.headers:
            rid = self.request.headers[ctm.HTTPHeader.RESOURCE_ID]
            resource = entity_t(rid, "/".join([root_url, "resource", rid, "view"]))

        if "application/json" in self.request.headers.get("Accept", ""):
            self.set_header("Content-Type", "application/json; charset=utf-8")
            result = self._build_json_response(files, root=root)
        else:
            # Offer the parent directory link only below the top listed level.
            if task and len(path) > 3 or not task and len(path) > 1:
                files.insert(0, self.FileInfo(
                    "..", self.FileInfo.Type.DIRECTORY, None, None, None, False,
                ))
            self.set_header("Content-Type", "text/html; charset=utf-8")
            for f in files:
                try:
                    f.name = f.name.encode("utf-8")
                except UnicodeDecodeError:
                    # The name cannot be encoded - leave it as is.
                    pass
            result = self.render_from_string(
                TMPL,
                root=root,
                host=settings.this.fqdn,
                # Directories first; the incoming order is kept within each group (stable sort).
                files=sorted(files, key=lambda f: f.type != self.FileInfo.Type.DIRECTORY),
                total_size=sum(f.size for f in files if f.size),
                task=task,
                resource=resource,
                tailable=tailable,
                tz_offset=datetime.timedelta(seconds=time.timezone)
            )

        self.set_header("Content-Length", len(result))
        global BYTES_SENT
        BYTES_SENT += len(result)
        self.finish(result)

    def _list_dir(self, include_body, root, abspath):
        """
        Lists directory given.

        If directory is very large, a lots of stat calls will slow down page generation.

        :param include_body: passed through to `_list_builder` (False for HEAD requests).
        :param root:         URL path of the directory, passed through to `_list_builder`.
        :param abspath:      physical location of the directory.

        Entries are listed sorted by name. Symbolic links are followed to obtain size and
        times; for a broken link the attributes of the link itself are used. Entries which
        disappear while the directory is being scanned are skipped.
        """

        files = []
        for fname in sorted(os.listdir(abspath)):
            fullpath = os.path.join(abspath, fname)
            # Ask for forgiveness instead of checking `os.path.exists` first: the entry
            # may be removed between the check and the `stat` call.
            try:
                stat_result = os.stat(fullpath)
            except OSError:
                try:
                    # Probably a broken symbolic link - describe the link itself.
                    stat_result = os.lstat(fullpath)
                except OSError:
                    continue
            f = self.FileInfo(
                fname,
                self._get_file_type(fullpath),
                stat_result[stat.ST_SIZE],
                datetime.datetime.utcfromtimestamp(stat_result[stat.ST_CTIME]),
                datetime.datetime.utcfromtimestamp(stat_result[stat.ST_MTIME]),
                os.readlink(fullpath) if os.path.islink(fullpath) else None,
            )

            files.append(f)

        # due to sort stability, we can re-sort by filetype
        return self._list_builder(include_body, root, files)

    def _browse_archive(self, include_body, url_path, archive, relpath):
        """
        List a directory inside an archive or send a single file out of it.

        :param include_body: if False (HEAD request), an empty listing is produced right
                             after the archive has been opened successfully.
        :param url_path:     path as in URL, passed to `_list_builder` / `_send_file`.
        :param archive:      physical location of the archive file.
        :param relpath:      path inside the archive; a false value means the archive root.

        Uses `libarchive` if it is available, the standard `tarfile` module otherwise.

        Raises:
            tornado.web.HTTPError(UNSUPPORTED_MEDIA_TYPE) on any error while reading the archive.
            tornado.web.HTTPError(NOT_FOUND) if `relpath` is not found in the archive.
        """
        def append(browse, entry, name):
            # Convert an archive entry to `FileInfo` and add it to the listing.
            # Note the last field (`target`) holds the boolean `entry.issym()` here.
            fi = self.FileInfo(
                name,
                self.FileInfo.Type.UNKNOWN,
                entry.size,
                datetime.datetime.fromtimestamp(entry.mtime),  # "TarInfo" object has no attribute "ctime"
                datetime.datetime.fromtimestamp(entry.mtime),
                entry.issym(),
            )
            if entry.isfile():
                fi.type = self.FileInfo.Type.REGULAR
            elif entry.isdir():
                fi.type = self.FileInfo.Type.DIRECTORY
            browse.files.append(fi)

        class FileNotFound(Exception):
            pass

        browse = None
        started, entries = time.time(), 0
        try:
            ar = libarchive.Archive(archive) if libarchive else tarfile.open(archive, 'r:*')
            if not include_body:
                return self._list_builder(include_body, url_path, [], False)
            entry, path = None, None  # just to avoid IDE warnings
            # First pass: look for the requested entry (stops at the first real entry
            # when the archive root is requested).
            for entry in ar:
                path = os.path.normpath(entry.pathname if libarchive else entry.name)
                if path == ".":
                    continue
                if not relpath or path.encode("utf-8") == relpath:
                    break
            else:
                # The archive is exhausted without a match.
                if relpath:
                    raise FileNotFound

            lost_entries = []
            if not relpath:
                # libarchive.Archive can be iterated just once, so the first entry gets lost in this case
                lost_entries = [entry] if libarchive else []
                browse = self.Browse("", [])
            elif entry.isdir():
                browse = self.Browse(path, [])
            else:
                # A file is requested - send it; the archive is closed by the file object's `close()`.
                return self._send_file(
                    include_body, url_path, archive,
                    (ArchiveFile if libarchive else Tarfile)(ar, entry)
                )
            logger.debug("Browse point object: %r", browse)
            # Second pass: build a tree of nested dicts keyed by path components;
            # the entry object itself is stored under the `None` key of its node.
            tree = {}
            for entry in it.chain(lost_entries, ar):
                entries += 1
                path = os.path.normpath((entry.pathname if libarchive else entry.name))
                if path == ".":
                    continue
                parts = path.split("/")
                last_dir = reduce(lambda a, i: a.setdefault(i, {}), parts, tree)
                last_dir[None] = entry
            # Descend to the node of the directory being browsed.
            root_dir = reduce(lambda a, i: a.get(i), browse.point.split("/") if browse.point else (), tree) or {}
            for name, items in sorted(root_dir.iteritems(), key=op.itemgetter(0)):
                if name is None:
                    continue
                entry = items.get(None)
                if not entry:
                    # The directory has no entry of its own in the archive - make up one.
                    if libarchive:
                        entry = libarchive.Entry(pathname=name, mode=stat.S_IFDIR, mtime=0)
                    else:
                        entry = tarfile.TarInfo(name)
                        entry.type = tarfile.DIRTYPE
                append(browse, entry, name)
        except FileNotFound:
            # `browse` stays None, so NOT_FOUND is raised at the end of the method.
            logger.warning("File '%s' not found in archive '%s'.", relpath, archive)
        except Exception as ex:
            msg = "Unsupported archive media type: {} ({})".format(get_mime(archive), str(ex))
            logger.exception(msg)
            raise tornado.web.HTTPError(httplib.UNSUPPORTED_MEDIA_TYPE, msg)
        finally:
            logger.debug(
                "Archive '%s' scanned (checked %d entries) with %s in %.2fs",
                archive, entries, "libarchive" if libarchive else "tarfile", time.time() - started
            )
        # NOTE(review): `ar` is not closed by this method on the listing paths - confirm
        # whether that is intended.
        if browse:
            return self._list_builder(include_body, url_path, sorted(browse.files, key=lambda f: f.name), False)
        raise tornado.web.HTTPError(httplib.NOT_FOUND, "File '{}' not found in archive '{}'.".format(relpath, archive))

    def _stream(self, path, kind):
        """
        Send the content of `path` to the client as a tarball attachment built on the fly.

        :param path: physical location of the file or directory to pack.
        :param kind: stream kind, a key of `TarballChunkedStream.STREAM_OPTIONS`.
        """
        basename = os.path.basename(path)
        logger.info("Sending %r stream with %r (%r)", kind, basename, path)

        options = self.TarballChunkedStream.STREAM_OPTIONS[kind]
        disposition = 'attachment; filename="{}.{}"'.format(basename, options["suffix"])
        self.set_header("Content-Type", options["content"])
        self.set_header("Transfer-Encoding", "chunked")
        self.set_header("Content-Disposition", disposition)
        # Headers have to reach the socket before the stream starts writing to it directly.
        self.flush()

        streamer = self.TarballChunkedStream(self, options['mode'])
        streamer.async_stream(path, basename)

    @staticmethod
    def _extract_task_id_from_path(nodes):
        """
        Guess which task a path belongs to.

        The third path component is taken as a candidate identifier. It is accepted only when the
        first three components are equal to what `ctt.relpath` returns for that candidate.

        :param nodes: request path split into components
        :return: the candidate identifier exactly as it appears in `nodes`, or `None`
        """
        head = nodes[0:3]
        if len(head) != 3:
            return None

        candidate = head[2]
        try:
            # NOTE(review): equality with a slice of `nodes` can only hold if `ctt.relpath` returns
            # the same sequence type as `nodes` - confirm against `ctt.relpath`.
            matches = head == ctt.relpath(candidate)
        except ValueError:
            # the candidate was rejected by `ctt.relpath`
            return None
        return candidate if matches else None

    def options(self, *args, **kwargs):
        """
        Answer an OPTIONS request: advertise HEAD and GET in `Allow`, let `_cors_headers`
        add its headers and finish the response. Positional and keyword arguments are ignored.
        """
        self.set_header("Allow", "HEAD,GET")
        self._cors_headers()
        self.finish()

    def prepare(self):
        """
        Register the request as being in progress and reject it with 503 when the configured
        `max_requests_in_progress` limit is exceeded.
        """
        # The counter is bumped even for rejected requests; per the original author's note the matching
        # decrement happens in `on_finish` (not shown here), which runs for every request.
        FileServerHandler.requests_in_progress += 1

        limit = settings.client.fileserver.max_requests_in_progress
        if FileServerHandler.requests_in_progress <= limit:
            return

        self.set_status(httplib.SERVICE_UNAVAILABLE, reason="Too many requests in progress")
        self.finish()
        raise tornado.web.Finish

    @tornado.web.asynchronous
    def head(self, path):
        """
        Async wrapper for all HEAD requests: runs `get_handle` in the worker pool with `include_body=False`
        """
        pool = get_pool()
        pool.submit(self.get_handle, path, False)

    @tornado.web.asynchronous
    def get(self, path):
        """
        Async wrapper for all GET requests: runs `get_handle` in the worker pool with `include_body=True`
        """
        pool = get_pool()
        pool.submit(self.get_handle, path, True)

    def get_handle(self, path, include_body=True):
        """
        Serve a single request; submitted to the worker pool by `get` and `head`.

        Resolves `path` to a location under `self.root` (or under the resources data directory for
        paths starting with "resource/"). Depending on what is found there and on the "stream" and
        "tail" query arguments it then streams a tarball, lists a directory, renders the tail page,
        browses into an archive or sends a plain file.

        A `tornado.web.HTTPError` is turned into the corresponding error response. Any other exception
        is logged and reported as 403 (`OSError`/`IOError`) or 500.

        :param path: request path relative to the served root
        :param include_body: forwarded to the listing/sending helpers; `head` passes `False`
        """
        try:
            if isinstance(path, unicode):
                path = self.parse_url_path(path).encode('utf-8')

            nodes = path.split("/")
            if path.startswith("resource/"):
                self.root = settings.client.resources.data_dir
                try:
                    nodes = list(ctr.relpath(int(nodes[1]))) + nodes[2:]
                except ValueError:
                    raise tornado.web.HTTPError(httplib.BAD_REQUEST, "Bad path: %s", path)

            if ctc.Tag.NEW_LAYOUT in settings.client.tags:
                task_id = self._extract_task_id_from_path(nodes)
                if task_id is not None:
                    Auth.check(self.request, task_id)

            abspath = os.path.abspath(os.path.join(self.root, *nodes))

            # Don't access above root. This has to be a prefix test on normalised paths: a substring
            # test would also accept siblings such as "<root>2/..." reached through "..".
            root_prefix = os.path.join(os.path.abspath(self.root), "")
            if not (abspath + os.sep).startswith(root_prefix):
                raise tornado.web.HTTPError(httplib.FORBIDDEN, "%s bad path", path)

            # The mode is validated case-insensitively, so normalise it once: `_stream` looks it up
            # verbatim in `STREAM_OPTIONS`.
            requested_stream = next(iter(self.request.arguments.get("stream") or []), "")
            stream = requested_stream.lower()
            if stream and stream not in self.TarballChunkedStream.STREAM_OPTIONS.viewkeys():
                raise tornado.web.HTTPError(
                    httplib.BAD_REQUEST, "Unknown stream mode '{}' requested.".format(requested_stream)
                )

            archive_relpath = None
            if not os.path.exists(abspath) or (os.path.isfile(abspath) and path.endswith('/')):
                # Maybe it is a relative link inside an archive: walk down while the components are
                # directories, stop at the first regular file and treat the rest as a path inside it.
                current = self.root
                node = current
                while nodes:
                    node = os.path.join(current, nodes.pop(0))
                    if os.path.isdir(node):
                        current = node
                    elif os.path.isfile(node):
                        break
                    else:
                        raise tornado.web.HTTPError(httplib.NOT_FOUND)
                archive_relpath = '/'.join(nodes)
                abspath = node

            if stream:
                self._stream(abspath, stream)
            elif os.path.isdir(abspath):
                self._list_dir(include_body, path, abspath)
            elif os.path.isfile(abspath):
                tail = self.request.arguments.get("tail")
                if tail:
                    logger.info("Using TAIL for %r", path)
                    fs_settings = settings.client.fileserver
                    if fs_settings.proxy.host:
                        path = os.path.join(*path.split(os.path.sep)[2:])
                        url = "{}://{}/tail/{}".format(fs_settings.proxy.scheme.ws, fs_settings.proxy.host, path)
                    else:
                        url = "ws://{}:{}/tail/{}".format(settings.this.fqdn, fs_settings.port, path)
                    html = self.render_from_string(
                        TAIL_PAGE_HTML,
                        path=path,
                        url=url
                    )
                    self.finish(html)
                elif archive_relpath is not None:
                    archive_relpath = archive_relpath.rstrip("/")
                    logger.info("Browse %r file from %r archive.", archive_relpath, abspath)
                    self._browse_archive(include_body, path, abspath, archive_relpath)
                else:
                    self._send_file(include_body, path, abspath)
            else:
                # Exists, but is neither a directory nor a regular file. Without an explicit answer
                # the asynchronous request would never be finished.
                raise tornado.web.HTTPError(httplib.NOT_FOUND)
        except tornado.web.HTTPError as ex:
            self.set_status(ex.status_code, reason=ex.reason)
            self.send_error(status_code=ex.status_code)
        except Exception as ex:
            logger.exception("Error while processing request %r", path)
            status_code = 403 if isinstance(ex, (OSError, IOError)) else 500
            self.send_error(status_code=status_code, exc_info=sys.exc_info())


class LogCache(object):
    """
    Cache of byte offsets at which log records start, kept per log file.

    A record starts on a line matching `LINE_PATTERN`; lines that do not match belong to the record
    before them. The cache allows `get` to return records by number without re-scanning the file.
    """
    __metaclass__ = common.utils.ThreadSafeSingletonMeta

    DATE_FMT = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}[,.]\d{3}"
    LINE_PATTERN = re.compile(r"""
        ^(?P<time>{})\s+
        (\(\s+[\d.,]+s\)\s+)? # duration, which does exist in common.types.task.TASK_LOG_FORMAT, but may be omitted
        (?P<level>\w+)\s+
        \((?P<component>\w+)\)
    """.format(DATE_FMT), re.VERBOSE)

    class Entry(common.patterns.Abstract):
        # last_size: number of bytes of the file that have been scanned so far
        # offset: byte offsets of record starts; the first record is assumed to start at 0
        # NOTE(review): the `[0]` default is a mutable list - presumably `Abstract` copies defaults
        # per instance; confirm, otherwise every entry would share one offset list.
        __slots__ = ("last_size", "offset")
        __defs__ = (0, [0])

    __cache = collections.defaultdict(Entry)

    @classmethod
    def _refresh(cls, path):
        """
        Update offset cache for a given log

        :param path: path to log to open
        """

        with open(path, "rb") as log:
            entry = cls.__cache[path]
            offsets = entry.offset
            pos = offsets[-1]
            log.seek(pos)

            first_read = True
            for line in log:

                # we always re-read the last of previously read lines in case it has grown between refreshes,
                # and its offset is already in the cache, even if it's the first line in file, so we always skip it

                if not first_read and cls.LINE_PATTERN.match(line):
                    offsets.append(pos)
                first_read = False
                pos += len(line)
            entry.last_size = pos

    @classmethod
    def _read(cls, path, start):
        """
        Extract and parse lines from a log file with the aid of offset cache

        :param path: path to read from
        :param start: first line number
        :return: list of dicts with "no", "time", "level", "component" and "content" keys
        """

        offsets = cls.__cache[path].offset
        lines = []
        if start >= len(offsets):
            return lines

        with open(path, "rb") as log:
            log.seek(offsets[start])
            for i in xrange(start, len(offsets)):
                if i + 1 < len(offsets):
                    amount = offsets[i + 1] - offsets[i]
                else:
                    # the last known record runs up to the end of file
                    amount = -1

                line = log.read(amount).strip()
                match = cls.LINE_PATTERN.match(line)

                if match is None:
                    data = {
                        "time": None,
                        "level": None,
                        "component": None,
                        "content": line
                    }
                else:
                    data = {
                        "time": match.group("time"),
                        "level": match.group("level"),
                        "component": match.group("component"),
                        # everything after the matched record header
                        "content": line[match.end():].strip()
                    }
                data.update(no=i)
                lines.append(data)

        return lines

    @classmethod
    def get(cls, path, start):
        """
        Return tail of the log, refreshing log cache (re-reading the file, possibly from the start) if necessary

        :param path: path to log file to read from
        :param start: number of line to start reading from; `None` means the last 100 records
        :return: a list of dicts of the following structure:

            :no: line number
            :time: timestamp as found in the log line
            :level: event severity
            :component: logging component name
            :content: logged line

        """

        entry = cls.__cache.get(path)  # `.get` does not create an entry for unknown paths
        if entry is not None and os.path.getsize(path) < entry.last_size:
            # The file became shorter than what was scanned (truncated or replaced), so the cached
            # offsets no longer describe it: forget them and scan from the start.
            cls.__cache.pop(path, None)
            entry = None

        if (
            entry is None or  # first time access
            (os.path.getsize(path) > entry.last_size)  # the log has changed
        ):
            cls._refresh(path)

        if start is None:
            start = max(len(cls.__cache[path].offset) - 100, 0)
        # noinspection PyBroadException
        try:
            return cls._read(path, start)
        except Exception:
            logging.exception("Failed to read log %r from line #%r", path, start)
            return []


class TailWebSocket(tornado.websocket.WebSocketHandler):
    """
    Websocket endpoint behind the tail page.

    Each incoming message is a JSON object. With a "line" key the answer is a list of parsed log
    records from `LogCache`. Otherwise "size" is the number of bytes the client has already seen
    and the answer carries the bytes after it, or the last `TAIL_SIZE` bytes when "size" is absent.
    """
    TAIL_SIZE = 1024 * 3
    _path = None

    def open(self, relpath):
        """
        Remember which file to tail.

        :param relpath: path captured from the URL, relative to the tasks data directory
        """
        root = os.path.abspath(settings.client.tasks.data_dir)
        path = os.path.abspath(os.path.join(root, relpath))
        # `relpath` comes straight from the URL: ".." components or an absolute path must not
        # lead outside of the data directory.
        if not (path + os.sep).startswith(os.path.join(root, "")):
            logger.warning("Refusing to tail %r: it is outside of %r", relpath, root)
            self.close()
            return
        self._path = path

    def on_message(self, message):
        """
        Answer one client poll.

        :param message: JSON text with optional "line" and "size" keys
        """
        if self._path is None:
            # the path was rejected in `open`
            return

        message = json.loads(message)
        size, line = map(message.get, ("size", "line"))
        c_size = os.path.getsize(self._path)

        if "line" in message:  # takes precedence over "size" keyword
            self.write_message({"data": LogCache.get(self._path, line)})
            return

        if size is None:
            # first poll: send at most the last TAIL_SIZE bytes
            offset = max(c_size - self.TAIL_SIZE, 0)
        elif size == c_size:
            # nothing new since the previous poll
            return
        else:
            offset = size

        with open(self._path, "rb") as f:
            f.seek(offset)
            # Read only up to the size snapshot reported back to the client. Bytes appended after
            # `getsize` are picked up by the next poll instead of being sent twice.
            data = f.read(max(c_size - offset, 0))
        self.write_message({"data": data, "size": c_size})


class Auth(object):
    """ Checks the request is authenticated and has permissions for given task """

    # (session id or token, task id) -> whether write access to the task was granted
    __auth_cache = LRUCache(2000)

    @classmethod
    def session_or_token(cls, request):
        """
        Extract credentials from the request: the session cookie is preferred, then an
        "Authorization: OAuth <token>" header.

        :param request: incoming HTTP request
        :return: `(raw credential, auth object)` or `(None, None)` when no usable credentials are found
        """
        session_id = request.cookies.get(common.proxy.YandexSession.COOKIE_NAME)
        if session_id:
            session_id = session_id.value
            return session_id, common.proxy.YandexSession(session_id)

        # A header carrying only the scheme (no token) is treated as "no credentials" rather than
        # failing with an IndexError.
        oauth = request.headers.get("Authorization", "").split()
        if len(oauth) >= 2 and oauth[0] == "OAuth":
            return oauth[1], common.proxy.OAuth(oauth[1])

        return None, None

    @classmethod
    def check(cls, request, tid):
        """
        Make sure the requester has write access to the task.

        :param request: incoming HTTP request
        :param tid: task identifier
        :return: auth object to be used for further REST calls on behalf of the requester
        :raises tornado.web.HTTPError: 403 when credentials are missing or do not grant write access
        """
        if settings.common.installation in common.types.misc.Installation.Group.LOCAL:
            return common.proxy.NoAuth()

        session_or_token, auth = cls.session_or_token(request)
        if not auth:
            raise tornado.web.HTTPError(httplib.FORBIDDEN, "No permissions for requested path.")

        cache_key = (session_or_token, tid)

        access = cls.__auth_cache.get(cache_key)
        if access is None:
            # the REST client is only needed on a cache miss
            rest_client = common.rest.Client(auth=auth)
            task_meta = rest_client.task[tid][:]
            access = (task_meta["rights"] == ctu.Rights.WRITE)
            cls.__auth_cache[cache_key] = access
        if not access:
            raise tornado.web.HTTPError(httplib.FORBIDDEN, "No permissions for requested path.")
        return auth


@common.utils.singleton
def agentr():
    """ Build the `aclient.Service` client bound to the module logger """
    service = aclient.Service(logger)
    return service


class ProcessListHandler(tornado.web.RequestHandler):
    """ Process list handler """

    TIMEOUT = 3  # timeout to receive process list, in seconds

    def __process_list(self, cmd):
        """
        Run `cmd` through the shell and return its output, or the error text if anything goes wrong

        :param cmd: command line to execute
        """
        try:
            output = sp.check_output(
                cmd, shell=True, preexec_fn=common.os.User.Privileges().__enter__, timeout=self.TIMEOUT
            )
        except Exception as ex:
            output = str(ex)
        return output

    @tornado.web.asynchronous
    def get(self, tid):
        """
        Respond with the HTML-escaped output of the task's `ps_command`, or with an empty
        response when no such command is registered for the task

        :param tid: task identifier captured from the URL
        """
        fs_meta = agentr().fileserver_meta[tid]
        if not (fs_meta and fs_meta.ps_command):
            return self.finish()

        Auth.check(self.request, tid)
        listing = self.__process_list(str(fs_meta.ps_command))
        page = "<pre>{}</pre>".format(tornado.escape.xhtml_escape(listing))

        self.set_header("Access-Control-Allow-Origin", "*")
        requested_headers = self.request.headers.get("Access-Control-Request-Headers")
        if requested_headers:
            # echo back whatever headers the client asked to be allowed
            self.set_header("Access-Control-Allow-Headers", requested_headers)
        self.finish(page)


class DebuggerRunner(tornado.web.RequestHandler):
    """ Debugger process runner """

    # The target host is substituted into a command line which is then split with `shlex`, so it is
    # restricted to characters found in host names and IPv4/IPv6 literals and must not start with a
    # dash. Anything else could add extra arguments to the debugger command.
    _TARGET_HOST = re.compile(r"\A[A-Za-z0-9\[:][A-Za-z0-9_.:%\[\]-]*\Z")

    @tornado.web.asynchronous
    def post(self, tid):
        """
        Start the debugger attach command registered for the task and report its output.

        The request body is JSON of the form `{"target": {"host": ..., "port": ...}}`.

        :param tid: task identifier captured from the URL
        :raises tornado.web.HTTPError: 404 when no attach command is registered for the task,
            400 when the body or the target is malformed
        """
        fs_meta = agentr().fileserver_meta[tid]
        if not fs_meta or not fs_meta.attach_command:
            raise tornado.web.HTTPError(httplib.NOT_FOUND, "No task #{} session registered".format(tid))

        headers = self.request.headers
        rest = common.rest.Client(auth=Auth.check(self.request, tid))
        (rest << rest.HEADERS({
            ctm.HTTPHeader.REAL_IP: headers.get(ctm.HTTPHeader.REAL_IP, self.request.remote_ip),
            ctm.HTTPHeader.FORWARDED_FOR: headers.get(ctm.HTTPHeader.FORWARDED_FOR, self.request.remote_ip)
        })).task[tid].audit(message="Remote debugger session started via {}.{} from {}".format(
            headers.get(ctm.HTTPHeader.BACKEND_NODE),
            headers.get(ctm.HTTPHeader.INT_REQUEST_ID),
            headers.get(ctm.HTTPHeader.REAL_IP)
        ))
        try:
            target = json.loads(self.request.body)["target"]
            host = target["host"]
            port = int(target["port"])
        except (KeyError, ValueError, TypeError) as ex:
            raise tornado.web.HTTPError(httplib.BAD_REQUEST, "Request body parse error: " + str(ex))

        try:
            if settings.common.installation == common.types.misc.Installation.LOCAL:
                host = "fc00::1"
            # a non-string host makes `match` raise TypeError, which is reported as 400 below
            if not self._TARGET_HOST.match(host):
                raise ValueError("invalid host {!r}".format(host))
            port = int(port)
        except (AttributeError, ValueError, TypeError) as ex:
            raise tornado.web.HTTPError(
                httplib.BAD_REQUEST, "Format of 'target' parameter is incorrect: " + str(ex)
            )

        cmd = shlex.split(fs_meta.attach_command.format(host=host, port=port, pid=fs_meta.pid))
        logging.info("Starting debugger for process #%r of task #%r by command %r", fs_meta.pid, tid, cmd)

        proc = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE, preexec_fn=common.os.User.Privileges().__enter__)
        stdout, stderr = proc.communicate()
        if proc.returncode:
            self.set_status(httplib.SERVICE_UNAVAILABLE, "Debugger process exit unexpectedly with RC {}".format(
                proc.returncode
            ))

        self.set_header("Access-Control-Allow-Origin", "*")
        hdrs = headers.get("Access-Control-Request-Headers")
        if hdrs:
            self.set_header("Access-Control-Allow-Headers", hdrs)
        self.set_header("Content-Type", "text/plain")
        self.finish("STDOUT:\n" + stdout + "\nSTDERR:\n" + stderr + "\nCODE: " + str(proc.returncode))


class CurrentActionsHandler(tornado.web.RequestHandler):
    """ Reports what `agentr().progress_meta` holds for a task """

    @tornado.web.asynchronous
    def get(self, tid):
        """
        Respond with `{"actions": [...]}` built from the task's progress items, or with
        204 when there is no progress record for the task

        :param tid: task identifier captured from the URL
        """
        Auth.check(self.request, tid)
        progress = agentr().progress_meta.by_task(tid)
        if progress is None:
            self.set_status(httplib.NO_CONTENT)
            return self.finish()

        # The list is wrapped into a dict on purpose.
        # XSS protection: https://github.com/tornadoweb/tornado/issues/1009
        payload = {"actions": [action.encode() for action in progress]}
        self.finish(payload)


class Application(tornado.web.Application):
    @staticmethod
    def log_request(handler):
        """Writes a completed HTTP request to the module logger.

        The level depends on the response status: `info` below 400, `warning` below 500
        and `error` otherwise. The record carries the status, the request summary,
        the User-Agent header and the request time in milliseconds.
        """
        status = handler.get_status()
        if status >= 500:
            emit = logger.error
        elif status >= 400:
            emit = logger.warning
        else:
            emit = logger.info

        elapsed_ms = 1000.0 * handler.request.request_time()
        user_agent_of = handler.request.headers.get
        emit(
            "%d %s \"%s\" %.2fms",
            status, handler._request_summary(), user_agent_of("User-Agent"), elapsed_ms
        )


class FileServer(object):
    """ Owns the tornado HTTP server and the application it serves """

    def __init__(self, file_server_dir):
        """
        :param file_server_dir: directory which contains the "butterfly" subdirectory
        """
        self.need_serve_after_stop = False
        self._butterfly_dir = os.path.join(file_server_dir, "butterfly")
        self._impl = tornado.httpserver.HTTPServer(self.__application(), no_keep_alive=False)

    def __application(self):
        """ Build the tornado application and hand it over to the butterfly module """
        handlers = [
            (r"/tail/(.*)", TailWebSocket),
            (r"/ps/([0-9]+)", ProcessListHandler),
            (r"/debugger/([0-9]+)", DebuggerRunner),
            (r"/actions/([0-9]+)", CurrentActionsHandler),
            (r"^/(.*)", FileServerHandler, {"path": settings.client.tasks.data_dir}),
        ]
        options = dict(
            transforms=[],  # XXX: chunking does not work with tail mode
            gzip=False,  # XXX: gzip=True does not work with tail mode
            debug=False,  # XXX: debug=True does not work with daemonzation
            static_path=os.path.join(self._butterfly_dir, "static"),
            template_path=os.path.join(self._butterfly_dir, "templates"),
            static_url_prefix="/shell/static/",
        )
        application = Application(handlers, **options)
        application.systemd = None
        application.butterfly_dir = self._butterfly_dir

        global butterfly
        butterfly.application = application
        butterfly.sandbox_settings = settings
        # imported only after the attributes above are set on the butterfly module
        from .butterfly import routes  # noqa
        return application

    def listen(self, *args):
        """ Forward all arguments to the underlying HTTP server's `listen` """
        self._impl.listen(*args)

    def reload(self, *_):
        """
        Signal handler: close the listening socket and stop the event loop.
        Repeated calls while a stop is already pending only log a message.
        """
        console.setProcTitle("[sandbox] File Server (finishing...)")
        if self.need_serve_after_stop:
            logger.info("Caught signal reloading... in progress\n")
            return

        logger.info("Caught signal reloading...\n")
        self.need_serve_after_stop = True

        loop = get_loop()
        # first close listen socket, then stop event loop
        for callback in (self._impl.stop, loop.stop):
            loop.add_callback(callback)


def main(args=sys.argv):
    """
    Entry point of the file server daemon.

    Loads the settings into the module-level `settings`, creates the `FileServer`, installs signal
    handlers, starts listening on the configured port and runs the event loop until it is stopped.
    If the stop was requested through `FileServer.reload`, the pid file is removed and the worker
    pool is shut down.

    :param args: command line; only `args[0]` (path to this script) is used
    """
    global settings
    common.config.ensure_local_settings_defined()
    settings = common.config.Registry()
    service_user, _ = common.os.User.get_service_users(settings)

    console.setProcTitle("[sandbox] File Server")
    pidfile_name = os.path.join(settings.client.dirs.run, "fileserver.py.pid")
    log_name = os.path.join(settings.client.log.root, settings.client.fileserver.log.name)
    # An already existing log file is handed over to the service user (only possible with root)
    if common.os.User.has_root and os.path.exists(log_name):
        os.chown(log_name, service_user.uid, -1)
    with open(args[0], "rb"):
        # This is some kind of lock for daemon's files for afterlife:
        # the script file itself stays open for as long as the server runs
        with common.os.User.Privileges(service_user.login):
            os.environ["HOME"] = service_user.home  # "Fix" for strange behaviour on MacOS
            server = FileServer(os.path.dirname(args[0]))
            # SIGTERM and SIGINT both request a stop; SIGUSR2 dumps threads to the log
            signal.signal(signal.SIGTERM, server.reload)
            signal.signal(signal.SIGINT, server.reload)
            signal.signal(signal.SIGUSR2, lambda *_: common.threading.dump_threads(logger))
            os.chdir(settings.client.tasks.data_dir)
            common.statistics.Signaler(common.statistics.ClientSignalHandler(), component=ctm.Component.FILESERVER)
            common.log.setup_log(log_name, "DEBUG")

            logger.info(
                "Initializing %s root privileges. PID is %d, listening port %d",
                "WITH" if common.os.User.has_root else "without", os.getpid(), settings.client.fileserver.port
            )
            server.listen(settings.client.fileserver.port)

            # hack to wake up main tornado loop: a signal writes to the loop's waker descriptor,
            # so callbacks queued by the signal handler are not left waiting
            signal.set_wakeup_fd(get_loop()._waker.writer.fileno())
            get_loop().start()

            # Reached only after the event loop has been stopped
            if server.need_serve_after_stop:
                if os.path.exists(pidfile_name):
                    os.unlink(pidfile_name)
                get_pool().shutdown()


# Start the daemon only when the file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
