use hahn;
pragma yt.Pool="robot-webmaster";

-- Tokenizer callable built once with TextProcessing::MakeTokenizer and reused
-- by $tokenize_text. Options passed: Lowercasing and Lemmatizing switched on,
-- SeparatorType "BySense", and TokenTypes restricted to the single entry "Word".
-- NOTE(review): the precise effect of each option is defined by the
-- TextProcessing UDF itself — confirm against its documentation.
$tokenizer = TextProcessing::MakeTokenizer(
    True as Lowercasing,
    True as Lemmatizing,
    "BySense" as SeparatorType,
    AsList("Word") as TokenTypes
);

-- Turns a text into a comma-separated string of its tokens.
--   $text: the value handed to $tokenizer.
-- Returns: the Token fields produced by $tokenizer whose length() exceeds 2,
-- joined with "," in the order the tokenizer emitted them.
-- NOTE(review): length() presumably counts bytes rather than characters for
-- String values, so the threshold behaves differently for non-ASCII tokens —
-- confirm this is intended.
$tokenize_text = ($text) -> {
    -- Keep only the token text from each tokenizer result item.
    $all_tokens = ListMap(
        $tokenizer($text),
        ($item) -> { return $item.Token; }
    );

    -- Discard tokens whose length is 2 or less.
    $kept_tokens = ListFilter(
        $all_tokens,
        ($tok) -> { return length($tok) > 2; }
    );

    return String::JoinFromList($kept_tokens, ",");
};

-- Rebuilds the dataset_full table (its previous contents are truncated).
-- One output row is produced per row of `cvs`, enriched with:
--   * the resolution from `resolutions` ("unknown" when there is no match);
--   * the Hits* columns from the seocheck snapshot (empty string / 0 when
--     there is no match);
--   * TextTokens — the CV text run through $tokenize_text.
-- Both lookups are keyed on (candidate_id, attachment_id) of the CV itself.
-- The snapshot join must NOT be keyed on `r`: `r` is the nullable side of a
-- left join, so for CVs without a resolution its keys are NULL and the
-- snapshot row would never match, silently dropping the Hits* values.
-- The result is written sorted by CandidateId, AttachmentId.
insert into `//home/webmaster/users/lester/SS15208/dataset_full`
    with truncate
select
    cv.candidate_id as CandidateId,
    cv.attachment_id as AttachmentId,
    coalesce(r.resolution, "unknown") as Resolution,
    cv.text as Text,
    $tokenize_text(cv.text) as TextTokens,
    cv.full_name as FullName,
    coalesce(s.HitsHow, "") as HitsHow,
    coalesce(s.HitsWhat, "") as HitsWhat,
    coalesce(s.HitsWhere, "") as HitsWhere,
    coalesce(s.HitsWho, "") as HitsWho,
    coalesce(s.HitsTotal, 0) as HitsTotal,
    cv.source_table as SourceTable
from `//home/webmaster/users/lester/SS15208/cvs` as cv
left join `//home/webmaster/users/lester/SS15208/resolutions` as r
    on r.candidate_id == cv.candidate_id
    and r.attachment_id == cv.attachment_id
left join `//home/webmaster/prod/seocheck/processed/snapshot` as s
    on cv.candidate_id == s.CandidateId
    and cv.attachment_id == s.AttachmentId
order by
    CandidateId,
    AttachmentId
