from bs4 import BeautifulSoup

import yt.wrapper as yt


def find_amp_serp_elements(html_with_seanid, filter_amp_only):
    """Collect the ``seanid`` values of tagged elements in an HTML document.

    Every tag with a truthy ``seanid`` attribute is considered, in the order
    BeautifulSoup returns them.

    Args:
        html_with_seanid: HTML markup to parse with the ``html.parser``
            backend.
        filter_amp_only: When truthy, keep only elements that contain at
            least one descendant tag with a truthy ``data-amp`` attribute.
            When falsy, every element with a ``seanid`` is kept.

    Returns:
        A list of ``seanid`` attribute values.
    """
    def carries_seanid(tag):
        return tag.attrs.get('seanid')

    def carries_amp_marker(tag):
        return tag.attrs.get('data-amp')

    document = BeautifulSoup(
        html_with_seanid, 'html.parser', from_encoding="utf8"
    )
    return [
        element.attrs.get('seanid')
        for element in document.find_all(carries_seanid)
        # The AMP lookup only matters when filtering was requested.
        if not filter_amp_only or bool(element.find(carries_amp_marker))
    ]


def find_amp_mapper(rec):
    """Map one input row to one output row per AMP element found in its HTML.

    Reads ``rec['html_with_seanid']`` and, for every ``seanid`` returned by
    ``find_amp_serp_elements`` with AMP filtering enabled, yields a dict with
    the keys ``serp_id`` (from ``rec['id']``), ``query_text`` (from
    ``rec['query_text']``) and ``seanid``.

    Rows whose HTML has no AMP elements produce no output.
    """
    amp_ids = find_amp_serp_elements(
        rec['html_with_seanid'], filter_amp_only=True
    )
    for amp_id in amp_ids:
        # Row fields are looked up per yielded record, so they are only
        # touched when there is something to emit.
        yield {
            'serp_id': rec['id'],
            'query_text': rec['query_text'],
            'seanid': amp_id,
        }


if __name__ == '__main__':
    # Run the mapper over the parsed SERP sample table and write the
    # resulting (serp_id, query_text, seanid) rows to the output table.
    yt.run_map(
        find_amp_mapper,
        source_table='//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample_parsed',
        destination_table='//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample_amp_seanid_list',
    )
