#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import time
import json
import sys
import traceback
import email.utils
import hashlib
from xml.etree import ElementTree as ET
from os.path import isfile, basename
from os import remove, walk, environ
from datetime import datetime
from urlparse import urlparse, urljoin
from urllib import quote

import requests

from hosted import config

# Filename used as a post's avatar when no avatar image could be cached.
PLACEHOLDER_AVATAR = 'placeholder-avatar.png'

# User-Agent sent with every request; it embeds the SERIAL environment
# variable, or 'unknown' when that variable is not set.
UA = 'info-beamer nitter rss poller serial {}'.format(environ.get('SERIAL', 'unknown'))
# Accept header value: feed/XML media types first, then a catch-all.
ACCEPT_RSS = 'application/rss+xml, application/atom+xml, text/xml, */*'

# Module-wide HTTP session used for the feed requests and handed to the
# image download code.
# NOTE: the assignment below replaces the session's header mapping with a
# plain dict, so these two headers are the only ones set explicitly here.
session = requests.Session()
session.headers = {
    'User-Agent': UA,
    'Accept': ACCEPT_RSS,
}


def local_tag(tag):
    """Return *tag* without a leading ``{namespace}`` qualifier.

    Tags that do not start with ``{`` are returned unchanged.
    """
    if not tag.startswith('{'):
        return tag
    # Keep whatever follows the first closing brace.
    return tag.split('}', 1)[-1]


def _u(s):
    if s is None:
        return u''
    if isinstance(s, unicode):
        return s
    return s.decode('utf-8', 'replace')


def html_to_text(s):
    """Reduce an HTML fragment to plain text.

    A fixed set of character entities is replaced by the characters they
    stand for, ``<br>`` variants and ``</p><p>`` become a single space,
    every remaining ``<...>`` tag is removed and the result is stripped of
    surrounding whitespace.  ``None`` yields ``u''`` (via ``_u``).
    """
    string = _u(s)
    # NOTE(review): the substitutions run in whatever order the dict yields
    # its items, so the outcome for double-escaped input such as
    # ``&amp;lt;`` depends on that order -- confirm this is acceptable.
    for source, replacement in {
        u'&nbsp;': u' ',
        u'&#160;': u' ',
        u'&amp;': u'&',
        u'&#38;': u'&',
        u'&gt;': u'>',
        u'&#62;': u'>',
        u'&lt;': u'<',
        u'&#60;': u'<',
        u'&quot;': u'"',
        u'&#34;': u'"',
        u'&apos;': u'\'',
        u'&#39;': u'\'',
        u'<br>': u' ',
        u'<br/>': u' ',
        u'<br />': u' ',
        u'</p><p>': u' ',
    }.items():
        string = string.replace(source, replacement)
    # Tags are stripped only after the entities were decoded, so markup that
    # arrived entity-escaped (``&lt;p&gt;``) is removed by this step as well.
    string = re.sub(u'<[^>]+>', u'', string)
    return string.strip()


def elem_plain_text(el):
    """Return the concatenated text of *el* and all of its descendants.

    The element's own text comes first; each child contributes its own
    flattened text followed by the child's tail text.
    """
    def _pieces(node):
        # Depth-first walk in document order.
        yield node.text or u''
        for child in node:
            for piece in _pieces(child):
                yield piece
            if child.tail:
                yield child.tail

    return u''.join(_pieces(el))


def child_text_by_tag(parent, tag_names):
    """Return the stripped plain text of the first matching child element.

    A direct child of *parent* matches when its namespace-less tag is in
    *tag_names*.  ``u''`` is returned when no child matches.
    """
    candidates = (child for child in parent if local_tag(child.tag) in tag_names)
    first = next(candidates, None)
    if first is None:
        return u''
    return elem_plain_text(first).strip()


def child_html_raw(parent, tag_names):
    """Return the first matching child of *parent* serialized as HTML.

    The child element itself is passed to ``ET.tostring`` (UTF-8, ``html``
    method) and the result is returned as unicode.  ``u''`` is returned
    when no direct child has a namespace-less tag in *tag_names*.
    """
    for child in parent:
        if local_tag(child.tag) not in tag_names:
            continue
        serialized = ET.tostring(child, encoding='utf-8', method='html')
        return _u(serialized)
    return u''


def item_link(item):
    """Return the URL of the first usable ``link`` child of *item*.

    For each ``link`` child the ``href`` attribute is preferred and the
    element text is used otherwise; links that provide neither are skipped.
    Returns ``u''`` when nothing usable is found.
    """
    for node in item:
        if local_tag(node.tag) == 'link':
            candidate = node.get('href') or node.text
            if candidate:
                return candidate.strip()
    return u''


def item_guid(item):
    """Return the stripped text of the first non-empty ``guid`` child, or ``u''``."""
    texts = (
        node.text for node in item
        if local_tag(node.tag) == 'guid' and node.text
    )
    found = next(texts, None)
    if found is None:
        return u''
    return found.strip()


# ISO 8601 / RFC 3339 timestamp as used by Atom, e.g. 2023-01-02T03:04:05Z
# or 2023-01-02T03:04:05.123+01:00.  Groups: year, month, day, hour, minute,
# second (optional), offset sign, offset hours, offset minutes.
_ISO8601_RE = re.compile(
    r'^(\d{4})-(\d{2})-(\d{2})[Tt ](\d{2}):(\d{2})(?::(\d{2})(?:[.,]\d+)?)?'
    r'\s*(?:[Zz]|([+-])(\d{2}):?(\d{2}))?$'
)


def _parse_feed_date(raw):
    """Convert a feed date string into an integer Unix timestamp.

    RFC 822/2822 dates (RSS ``pubDate``) are tried first, then ISO 8601 /
    RFC 3339 dates (Atom ``published`` / ``updated``).  An ISO date without
    a zone designator is interpreted as UTC.  Returns ``None`` when *raw*
    cannot be interpreted in either format.
    """
    import calendar  # local import: keeps this block self-contained

    parsed = email.utils.parsedate_tz(raw)
    if parsed:
        try:
            return int(email.utils.mktime_tz(parsed))
        except (ValueError, OverflowError):
            # Syntactically accepted but out of range; treat as unparseable.
            return None

    match = _ISO8601_RE.match(raw)
    if match is None:
        return None
    year, month, day, hour, minute = [int(match.group(i)) for i in range(1, 6)]
    second = int(match.group(6) or 0)
    try:
        # datetime() validates the individual fields (month 13, hour 25, ...).
        moment = datetime(year, month, day, hour, minute, second)
    except ValueError:
        return None
    offset = 0
    if match.group(7):
        offset = int(match.group(8)) * 3600 + int(match.group(9)) * 60
        if match.group(7) == '-':
            offset = -offset
    # The fields are local to the given offset; subtract it to get UTC.
    return calendar.timegm(moment.timetuple()) - offset


def item_timestamp(item):
    """Return the publication time of *item* as an integer Unix timestamp.

    The direct children ``pubDate``, ``published`` and ``updated`` are
    examined in document order and the first one holding a parseable date
    wins.  When none can be parsed the current time is returned.
    """
    for el in item:
        if local_tag(el.tag) not in ('pubDate', 'published', 'updated'):
            continue
        raw = (el.text or '').strip()
        if not raw:
            continue
        stamp = _parse_feed_date(raw)
        if stamp is not None:
            return stamp
    return int(time.time())


def item_title(item):
    """Return the stripped plain text of the item's ``title`` child (``u''`` if absent)."""
    title_tags = ('title',)
    return child_text_by_tag(item, title_tags)


def item_description_html(item):
    """Return the body of *item* taken from its first
    ``description`` / ``summary`` / ``content`` child.

    The serialized element is preferred; if that comes back empty the
    child's plain text is returned instead.
    """
    body_tags = ('description', 'summary', 'content')
    return child_html_raw(item, body_tags) or child_text_by_tag(item, body_tags)


def item_enclosure_image(item):
    """Return the URL of the first image enclosure of *item*, or ``None``.

    An ``enclosure`` child qualifies when its ``type`` attribute starts with
    ``image/`` (case-insensitive) and it has a non-empty ``url`` attribute.
    """
    for node in item:
        if local_tag(node.tag) == 'enclosure':
            mime = (node.get('type') or '').lower()
            href = node.get('url')
            if mime.startswith('image/') and href:
                return href.strip()
    return None


def iter_feed_items(root):
    """Yield the post elements found anywhere below (or at) *root*.

    All ``item`` elements are yielded if the tree contains any; only when
    there are none are the ``entry`` elements yielded instead.
    """
    rss_items = [node for node in root.iter() if local_tag(node.tag) == 'item']
    selected = rss_items or (
        node for node in root.iter() if local_tag(node.tag) == 'entry'
    )
    for node in selected:
        yield node


def parse_title_meta(title):
    title = _u(title)
    m = re.search(ur'\(@([A-Za-z0-9_]+)\)\s*:', title)
    if m:
        acct = m.group(1)
        display = title[:m.start()].strip()
        return acct, display
    m = re.search(ur'^@([A-Za-z0-9_]+)\s*:', title)
    if m:
        return m.group(1), u''
    return None, u''


def item_creator(item):
    """Return the author of *item*, or ``u''`` when none is given.

    A ``creator`` child with non-blank text takes precedence; otherwise the
    text of the first ``name`` element with text inside an ``author`` child
    is returned, stripped.
    """
    children = list(item)
    for node in children:
        if local_tag(node.tag) != 'creator':
            continue
        name = (node.text or '').strip()
        if name:
            return name
    for node in children:
        if local_tag(node.tag) == 'author':
            for sub in node:
                if sub.text and local_tag(sub.tag) == 'name':
                    return sub.text.strip()
    return u''


def normalize_handle(s):
    """Return a canonical account handle for comparisons.

    The value is coerced to unicode, stripped and lower-cased; one leading
    ``@`` is dropped and anything from the next ``@`` onwards is cut off.
    """
    handle = _u(s).strip().lower()
    if handle[:1] == u'@':
        handle = handle[1:]
    # partition() leaves the handle untouched when no further '@' exists.
    return handle.partition(u'@')[0]


def build_blocked_map():
    """Build the mapping of blocked account handles to the reason for the block.

    Two sources are merged:

    * ``blocked.txt`` in the working directory, one account per line with an
      optional ``# reason`` suffix; blank lines and lines starting with ``#``
      are skipped.  Accounts without a reason get ``'unknown'``.
    * the comma separated ``config.filter_accounts`` setting; these entries
      get the reason ``'specified manually'`` and override file entries.

    Handles are normalized with ``normalize_handle``.  Reading the file stays
    best-effort as before (a missing file is not an error), but any other
    problem is now reported on stderr instead of being silently dropped.
    """
    import errno  # local import: only needed to recognise a missing file

    blocked = {}
    try:
        with open('blocked.txt') as f:
            for line in f.read().splitlines():
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                acct, sep, reason = line.partition('#')
                hn = normalize_handle(acct)
                if hn:
                    blocked[hn] = reason.strip() if sep else 'unknown'
    except IOError as e:
        if e.errno != errno.ENOENT:
            sys.stderr.write("[nitter] cannot read blocked.txt: {!r}\n".format(e))
    except Exception as e:
        sys.stderr.write("[nitter] error while parsing blocked.txt: {!r}\n".format(e))
    # Tolerate an unset option instead of failing on None.split().
    for part in (config.filter_accounts or '').split(','):
        hn = normalize_handle(part)
        if hn:
            blocked[hn] = 'specified manually'
    return blocked


def extract_img_srcs(html):
    html = _u(html)
    return re.findall(ur'<img[^>]+src=["\']([^"\']+)["\']', html, re.I)


def resolve_url(base, url):
    """Resolve *url* against *base* and return the joined URL.

    Scheme-relative URLs (``//host/path``) are given the ``https:`` scheme.
    *base* is normalized to end in exactly one ``/`` before joining.
    """
    target = _u(url).strip()
    if target[:2] == '//':
        target = 'https:' + target
    root = _u(base).rstrip('/') + '/'
    return urljoin(root, target)


def cache_image(url, http_session, timeout=30):
    """Download *url* into the working directory unless it is already there.

    The local filename is the last path component of the URL; when that is
    empty or has no dot, the first 20 hex digits of the URL's MD5 digest plus
    ``.jpg`` are used instead.

    :param url: image URL (byte string or unicode); blank values are ignored.
    :param http_session: object with a requests-style ``get(url, timeout=...)``.
    :param timeout: seconds to wait for the server before giving up.
    :returns: the local filename, or ``None`` if there is no URL or the
        download failed.
    """
    url = _u(url).strip()
    if not url:
        return None
    encoded_url = url.encode('utf-8')
    image_filename = basename(urlparse(encoded_url).path)
    # Byte-string literal on purpose: the basename is a byte string and may
    # contain non-ASCII bytes that must not be implicitly decoded.
    if not image_filename or '.' not in image_filename:
        image_filename = hashlib.md5(encoded_url).hexdigest()[:20] + u'.jpg'
    if isfile(image_filename):
        return image_filename
    try:
        # Fetch first and only then create the file: a failed download must
        # not leave an empty file behind that later counts as a cache hit.
        r = http_session.get(encoded_url, timeout=timeout)
        r.raise_for_status()
        data = r.content
        with open(image_filename, 'wb') as f:
            f.write(data)
    except Exception as e:
        if isfile(image_filename):
            # A write that failed half way leaves a truncated image; drop it.
            try:
                remove(image_filename)
            except OSError:
                pass
        message = u"[nitter] exception while caching image {} - {!r}\n".format(url, e)
        sys.stderr.write(message.encode('utf-8'))
        return None
    return image_filename


def build_feed_url(nitter_base, feed_token):
    """Return the RSS URL for one configured feed, or ``None`` if it is blank.

    ``@user`` tokens map to ``<base>/<user>/rss``; every other token is
    treated as a search query and maps to
    ``<base>/search/rss?f=tweets&q=<token>``.

    :param nitter_base: base URL of the Nitter instance; trailing slashes
        are ignored.
    :param feed_token: feed entry as configured (byte string, unicode or
        ``None``).
    """
    base = _u(nitter_base).strip().rstrip(u'/')
    # Coerce first: calling .encode('utf-8') on a non-ASCII byte string would
    # trigger an implicit ASCII decode and fail.
    tok = _u(feed_token).strip()
    if not tok:
        return None
    if tok.startswith(u'@'):
        user = tok[1:].strip()
        if not user:
            return None
        # ``safe`` lists literal characters, not ranges; letters, digits and
        # ``_.-`` are never quoted anyway, so nothing extra is needed.
        path = quote(user.encode('utf-8'), safe='')
        return u'{}/{}/rss'.format(base, path)
    q = quote(tok.encode('utf-8'), safe='')
    return u'{}/search/rss?f=tweets&q={}'.format(base, q)


class NitterStatus(object):
    """A single feed post together with its filtering and image state."""

    def __init__(self, item_id, created_ts, text, acct, display_name, avatar_url, media_url, title_raw, desc_raw, blocked):
        # Fields that end up in the exported post list.
        self.item_id = item_id
        self.created_timestamp = created_ts
        self.text = text
        self.acct = acct
        # Fall back to the account name when no display name is known.
        self.display_name = display_name if display_name else acct
        # Local image filenames; populated by cache_images().
        self.profile_image = None
        self.attachment_image = None
        # Inputs kept for image download, retweet detection and blocking.
        self._avatar_url = avatar_url
        self._media_url = media_url
        self._title_raw = title_raw
        self._desc_raw = desc_raw
        self._blocked = blocked

    def _log(self, text):
        """Write *text* to stderr, prefixed with this post's id."""
        line = "[nitter] [tweet {}] {}\n".format(self.item_id, text)
        sys.stderr.write(line)

    @property
    def is_retweet(self):
        """True if the title starts with ``RT @`` (any case) or the first
        80 characters of the description contain ``RT @``."""
        title_start = _u(self._title_raw).lstrip().upper()
        if title_start.startswith(u'RT @'):
            return True
        return u'RT @' in _u(self._desc_raw)[:80]

    @property
    def is_blocked(self):
        """True if the normalized account is in the blocked map; the reason
        is logged when it is."""
        handle = normalize_handle(self.acct)
        if handle not in self._blocked:
            return False
        self._log("account is blocked: {}".format(self._blocked[handle]))
        return True

    @property
    def ignore_post(self):
        """True if the post must be dropped: a retweet while retweets are
        filtered, a post without text, or a post from a blocked account."""
        if config.filter_retweets and self.is_retweet:
            reason = "Filtered: retweet"
        elif not self.text:
            reason = "Filtered: empty content"
        else:
            # is_blocked does its own logging.
            return self.is_blocked
        self._log(reason)
        return True

    def cache_images(self, http_session, nitter_base):
        """Download avatar and attachment and record their local filenames.

        The placeholder avatar is used when no avatar could be cached;
        ``attachment_image`` is ``None`` when the post has no media URL.
        *nitter_base* is accepted but not used.
        """
        if self._avatar_url:
            self.profile_image = cache_image(self._avatar_url, http_session)
        self.profile_image = self.profile_image or PLACEHOLDER_AVATAR
        self.attachment_image = (
            cache_image(self._media_url, http_session) if self._media_url else None
        )


class NitterFetcher(object):
    def __init__(self):
        self.not_before = int(time.mktime(datetime.strptime(config.not_before, "%Y-%m-%d").timetuple()))
        self.posts_by_id = {}
        self.nitter_base = _u(config.nitter_base).strip()
        self.blocked = build_blocked_map()

    def _log(self, something, feed=None):
        if not isinstance(something, basestring):
            something = repr(something)
        if feed is not None:
            something = u"[{}] {}".format(feed, something)
        sys.stderr.write(u"[nitter] {}\n".format(something).encode('utf-8'))

    def _fetch_rss(self, url):
        self._log("GET {}".format(url))
        r = session.get(url.encode('utf-8'))
        self._log("http status code was {}".format(r.status_code))
        r.raise_for_status()
        return r.content

    def _parse_items(self, xml_bytes, feed_label):
        root = ET.fromstring(xml_bytes)
        out = []
        for item in iter_feed_items(root):
            link = _u(item_link(item))
            guid = _u(item_guid(item))
            item_id = guid or link
            if not item_id:
                continue
            title = item_title(item)
            desc_html = item_description_html(item)
            body_html = desc_html
            if body_html:
                body_html = re.sub(ur'^<description[^>]*>', u'', body_html, flags=re.I)
                body_html = re.sub(ur'</description>\s*$', u'', body_html, flags=re.I)
            text = html_to_text(body_html)
            if not text:
                text = html_to_text(title)
            acct, display = parse_title_meta(title)
            if not acct:
                acct = normalize_handle(item_creator(item)) or u'unknown'
            if not display:
                display = u''
            imgs = extract_img_srcs(desc_html)
            avatar_url = None
            media_url = item_enclosure_image(item)
            if media_url:
                media_url = resolve_url(self.nitter_base, media_url)
            if imgs:
                first = resolve_url(self.nitter_base, imgs[0])
                if len(imgs) > 1:
                    avatar_url = first
                    if not media_url:
                        media_url = resolve_url(self.nitter_base, imgs[1])
                else:
                    if u'profile_images' in imgs[0] or u'profile_images' in first:
                        avatar_url = first
                    elif not media_url:
                        media_url = first
                    else:
                        avatar_url = first
            ts = item_timestamp(item)
            st = NitterStatus(
                item_id, ts, text, acct, display, avatar_url, media_url,
                title, desc_html, self.blocked,
            )
            out.append(st)
        return out

    def fetch_feed(self, feed_token):
        url = build_feed_url(self.nitter_base, feed_token)
        if not url:
            return
        label = feed_token.strip()
        try:
            xml_bytes = self._fetch_rss(url)
            for st in self._parse_items(xml_bytes, label):
                if st.item_id in self.posts_by_id:
                    continue
                if st.ignore_post:
                    continue
                if st.created_timestamp < self.not_before:
                    self._log(
                        u"ignoring {} (too old)".format(st.item_id),
                        feed=label,
                    )
                    continue
                self.posts_by_id[st.item_id] = st
        except Exception as e:
            self._log(e, feed=label)

    def cache_and_cleanup_images(self):
        images = {
            PLACEHOLDER_AVATAR,
            'mastodon-logo.png',
            'node.png',
            'package.png',
            'package-header.jpg',
        }
        to_delete = set()
        merged = self.posts_by_id.values()
        for post in merged:
            post.cache_images(session, self.nitter_base)
            for path in (post.profile_image, post.attachment_image):
                if path is not None:
                    images.add(path)
        for root, dirs, files in walk('./'):
            for name in files:
                if (
                    name.endswith('png')
                    or name.endswith('jpg')
                    or name.endswith('jpeg')
                    or name.endswith('gif')
                ) and name not in images:
                    to_delete.add(name)
        for path in to_delete:
            self._log("deleting image {} because it's not used anywhere".format(path))
            remove(path)

    def dump_json(self):
        merged = self.posts_by_id.values()
        merged.sort(key=lambda p: p.created_timestamp, reverse=True)
        limit = int(config.count)
        merged = merged[:limit]
        out = []
        for post in merged:
            out.append({
                'created_at': post.created_timestamp,
                'content': post.text,
                'account': {
                    'acct': post.acct,
                    'display_name': post.display_name,
                    'avatar_static': post.profile_image,
                },
                'media_attachment': post.attachment_image or '',
            })
        self._log("have gotten {} posts in total".format(len(out)))
        if len(out) > 0:
            with open('tootlist.json', 'wb') as f:
                f.write(json.dumps(out, ensure_ascii=False).encode('utf8'))
            self._log("wrote {} posts to tootlist.json".format(len(out)))


def main():
    """Run the polling loop; this function does not return.

    With a poll interval of 0 the process only idles (after registering
    for a restart on config updates).  Otherwise every round fetches all
    configured feeds, refreshes the image cache and writes the post list,
    then sleeps ``poll_interval`` minutes, or one minute after a failure.
    """
    config.restart_on_update()

    if config.poll_interval == 0:
        print >>sys.stderr, "waiting for a config change"
        while True:
            time.sleep(100000)

    while True:
        try:
            fetcher = NitterFetcher()
            for token in config.feeds.split(','):
                fetcher.fetch_feed(token.strip())
            fetcher.cache_and_cleanup_images()
            fetcher.dump_json()
        except Exception:
            traceback.print_exc()
            delay = 60
        else:
            delay = 60 * config.poll_interval
        time.sleep(delay)


# Start polling only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
