# parse_html.py 8.9 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020-02-04 14:34
# @Author  : Lemon
# @File    : _parse_html.py
# @Software: PyCharm
import json
import re

import bs4
import demjson
from bs4 import BeautifulSoup


def _friend_entry(a_label, src):
    """Build one {'fbid', 'name', 'src'} dict from a profile <a> tag.

    Returns None when the tag has no hovercard, the hovercard carries no
    numeric user id, or the title is missing — callers just skip it.
    """
    data = a_label.attrs
    if 'data-hovercard' not in data:
        return None
    ids = re.findall(r'user.php\?id=(\d+)&', data['data-hovercard'])
    title = data.get('title')
    if ids and ids[0] and title:
        return {'fbid': ids[0], 'name': title, 'src': src}
    return None


def friend_unit(p: dict):
    """Parse a friends-list AJAX response into (friends, next_url).

    p: decoded AJAX response dict (see _unit_html for the HTML extraction).
    Returns a list of {'fbid', 'name', 'src'} dicts plus the 'ajaxify' URL
    of the "see more" pager when one is present (else None).

    Three page generations use different item markers; the first two share
    the same per-item layout and are handled by one loop.
    """
    next_url = None
    result = []
    html = _unit_html(p)
    bs = BeautifulSoup(html, "html.parser")

    if 'objectListItem' in html or 'friendBrowserListUnit' in html:
        css = '.objectListItem' if 'objectListItem' in html else '.friendBrowserListUnit'
        for li in bs.select(css):
            a_label = li.find_all('a', limit=2)
            try:
                # First anchor wraps the avatar <img>; may be absent.
                src = a_label[0].find('img').attrs['src']
            except (AttributeError, IndexError, KeyError):
                src = None
            entry = _friend_entry(a_label[1], src)
            if entry:
                result.append(entry)
    elif 'friendBrowserContent' in html:
        for li in bs.select('.friendBrowserContent'):
            try:
                # Avatar lives four levels up in this layout.
                src = li.parent.parent.parent.parent.find('img').attrs['src']
            except (AttributeError, KeyError):
                src = None
            entry = _friend_entry(li.find_all('a', limit=2)[0], src)
            if entry:
                result.append(entry)

    if 'uiMorePagerPrimary' in html:
        atag = bs.find('a', attrs={'class': 'uiMorePagerPrimary'})
        next_url = atag.get('ajaxify')
    return result, next_url


def _unit_html(p):
    """Return the HTML fragment carried by an AJAX response dict.

    Prefers the 'domops' channel (via get_domops_3) and falls back to
    payload.results.__html; returns '' when neither yields anything.
    """
    html = get_domops_3(p)
    if not html:
        # get_domops_3 returns '' on any failure, so a plain falsy check
        # replaces the original try/assert/except dance.
        html = p.get('payload', {}).get('results', {}).get('__html', '')
    return html


def find_input_fields_with_pc(html):
    """Return every <input> element inside the desktop login form.

    Raises IndexError when no form with id "login_form" exists.
    """
    soup = bs4.BeautifulSoup(html, "html.parser")
    login_form = soup.select("form[id=login_form]")[0]
    return login_form.find_all("input")


def show_home_page(html):
    """Parse the given HTML and return the BeautifulSoup tree."""
    return bs4.BeautifulSoup(html, "html.parser")


def get_domops_3(res):
    """Return the first non-empty '__html' payload from res['domops'].

    Each entry of res['domops'] is a list; only the first dict inside each
    entry is inspected (as in the original scan order). Returns '' when
    res is malformed or no truthy '__html' is found — callers rely on ''
    as the "nothing there" sentinel.
    """
    try:
        for group in res.get('domops') or []:
            for x in group:
                if isinstance(x, dict):
                    html = x.get('__html', '')
                    break
            else:
                continue  # no dict in this group — try the next one
            if html:
                return html
    except (AttributeError, TypeError):
        # res not a dict / groups not iterable: same '' result as before,
        # but without swallowing unrelated exceptions via a bare except.
        pass
    return ""


def get_overview_text(res):
    """Join the profile-overview row texts, one per line.

    Skips the first 'clearfix' row and strips the "Edit ..." link labels
    that Facebook renders inside each section. Returns '' when the
    response carries no HTML.
    """
    html = get_domops_3(res)
    if not html:
        return ""
    edit_labels = re.compile(r"Edit the places you've lived|Edit your work|Edit your education")
    soup = bs4.BeautifulSoup(html, "html.parser")
    rows = soup.find_all("div", class_="clearfix")
    return "\n".join(edit_labels.sub("", row.text) for idx, row in enumerate(rows) if idx)


def get_current_city(res):
    """Extract (city_name, city_page_id) from the 'current city' section.

    res: decoded AJAX response dict; HTML is pulled via get_domops_3.
    Returns (None, None) when the section is absent; best-effort, so a
    partially parsed result (text without id) can still be returned, as
    in the original.
    """
    html = get_domops_3(res)
    city_text = city_id = None
    if html:
        try:
            b = bs4.BeautifulSoup(html, 'html.parser')
            city = b.find('li', id='current_city')
            a_label = city.find('a')
            if a_label:
                url = "https://www.facebook.com" + a_label.attrs['data-hovercard']
                city_text = a_label.text
                city_id = re.findall(r"page.php\?id=(\d+)&", url)[0]
        except (AttributeError, KeyError, IndexError):
            # city is None / no hovercard attr / id pattern not matched —
            # deliberately best-effort; narrowed from a bare except.
            pass
    return city_text, city_id


def get_user_info(b):
    """Scrape the logged-in user's (id, name, url, image) from a parsed page.

    b: BeautifulSoup tree of a profile/home page.
    Primary source is an inline <script> blob matching
    "viewer_actor:...comment_count"; when absent, falls back to the
    timeline navigation anchor. Raises (AttributeError/IndexError) when
    neither source is present or parseable.
    """
    pattern = re.compile(r"viewer_actor:(.*?)comment_count", re.MULTILINE | re.DOTALL)
    script = b.find("script", text=pattern)
    if not script:
        # Fallback: the timeline nav anchor carries name/href, and its
        # <img> id embeds the numeric fbid as "..._header_<digits>".
        a_lable = b.find('a', attrs={'data-gt': '{"chrome_nav_item":"timeline_chrome"}'})
        name = a_lable.text
        url = a_lable.attrs.get('href')
        img_data = a_lable.find('img').attrs
        image = img_data.get('src')
        id = re.search("_header_(\d+)", a_lable.find('img').attrs.get('id')).group(1)
    else:
        # Re-run the pattern to get the full matched blob, then pick the
        # individual fields out of the JS object literal with regexes.
        info = pattern.search(script.text).group()
        id = re.findall(r'id:"(.*?)"', info)[0]
        name = re.findall(r',name:"(.*?)"', info)[0]
        url = re.findall(r'url:"(.*?)"', info)[0]
        image = re.findall(r'profile_picture_depth_0.*?uri:"(.*?)"', info)[0]
    try:
        # Best-effort upgrade: find a larger rendition (/pNNNxNNN/) of the
        # same image file inside a hidden_elem div; keep the small one on
        # any failure.
        r = re.compile('/p\d+x\d+/(.*?)\?')
        iname = r.findall(image)[0]
        pattern = re.compile('src="(https.*?/p\d{3,}x\d{3,}/%s.*?)"' % (iname), re.MULTILINE | re.DOTALL)
        elem = b.find('div', class_='hidden_elem', string=pattern)
        # NOTE(review): sub('&', ..., '&') replaces '&' with '&' — a no-op;
        # presumably this was meant to decode '&amp;' → '&'. Confirm against
        # live markup before changing.
        image = re.sub('&', lambda x: "&", pattern.findall(elem.string)[0])
    except:
        pass
    return id, name, url, image


def get_all_raw_id(text):
    """Return the raw_id values embedded in *text*.

    Handles both the double-escaped form (inside hidden_elem divs, result
    de-duplicated) and the singly-escaped inline form.
    """
    soup = bs4.BeautifulSoup(text, 'html.parser')
    if 'raw_id\\\\":' in text:
        pattern = re.compile(r'raw_id\\\\":(\d+),')
        collected = []
        for div in soup.find_all('div', class_='hidden_elem', string=pattern):
            collected.extend(pattern.findall(div.string))
        return list(set(collected))
    return re.compile(r'\\"raw_id\\":(\d+),').findall(text)


def get_hidden_input(res):
    """Collect hidden <input> name→value pairs from the response's HTML."""
    markup = get_domops_3(res)
    soup = bs4.BeautifulSoup(markup, 'html.parser')
    return {
        tag.attrs['name']: tag.attrs['value']
        for tag in soup.find_all('input', type='hidden')
        if 'name' in tag.attrs and 'value' in tag.attrs
    }


def get_div_text(html):
    """Return the plain-text content of an HTML fragment."""
    return bs4.BeautifulSoup(html, 'html.parser').text


def get_friend_div(text):
    """Parse a friends-tab payload into {'items': {'edges': [...], ...}}.

    Two page generations are supported:
      * pages embedding a TimelineAppCollectionListRenderer JS literal —
        decoded directly with demjson (keys are unquoted, so json won't do);
      * older pages rendering friend_list_item divs; the first page nests
        them inside a hidden_elem, later pages arrive as bare <li> items.
    Returns None when neither marker appears in *text*.
    """
    if 'TimelineAppCollectionListRenderer' in text:
        b = bs4.BeautifulSoup(text, 'html.parser')
        script = b.find('script', string=re.compile(r'TimelineAppCollectionListRenderer'))
        # Grab the collection object literal; \}\}\} anchors the closing braces.
        a = re.findall(r'TimelineAppCollectionListRenderer",collection:(\{.*?\}\}\})', script.string)[0]
        res = demjson.decode(a)
        return res
    elif 'friend_list_item' in text:
        data = {'items': {'edges': [], 'count': 0}}
        b = bs4.BeautifulSoup(text, 'html.parser')
        elem = b.find('div', class_='hidden_elem', string=re.compile(r'data-testid="friend_list_item"'))
        if elem:
            # First page: the real markup is the *string content* of the
            # hidden_elem div, so it must be re-parsed.
            elem = bs4.BeautifulSoup(elem.string, 'html.parser')
            divs = elem.find_all('div', {"data-testid": "friend_list_item"})
            first_page = True
        else:
            # Subsequent (AJAX) pages render items as plain <li> elements.
            first_page = False
            divs = b.find_all('li')

        for dd in divs:
            node = dict()
            item = dict()
            try:
                a_data = dd.find('a').attrs
                img_data = dd.find('img').attrs

                url = a_data.get('href')
                # Numeric fbid comes from the hovercard URL; items without
                # one are skipped via the except below.
                fbid = re.findall(r'user.php\?id=(\d+)&', a_data.get('data-hovercard', ""))[0]
                image = img_data.get('src')
                name = img_data.get('aria-label')
                item['image'] = {'uri': image}
                item['title'] = {'text': name}
                item['node'] = {'id': fbid, 'url': url}
                node['node'] = item
                data['items']['edges'].append(node)
            except:
                # Malformed item (no <a>/<img>/fbid) — drop it silently.
                pass
        if first_page:
            # First page also carries pagination state for the next request.
            script = b.find('script', string=re.compile('"MedleyPageletRequestData"'))
            page = re.compile(r'"MedleyPageletRequestData","set",\[\],\[(.*?)\]')
            ext_data = demjson.decode(page.search(script.string).group(1))

            load = re.compile('\["TimelineAppCollection","enableContentLoader".*?\]\]')
            script_126 = b.find('script', string=load)
            if script_126:
                tttt = load.search(script_126.string).group()
                ext_data.update({
                    # The cursor is a long base64url-ish token (50-200 chars).
                    'cursor': re.search(r'"([A-Za-z0-9-_]{50,200})"', tttt).group(1),
                    'collection_token': re.search(r'"pagelet_timeline_app_collection_(.*?)"', tttt).group(1),
                })
                data['items']['has_next_page'] = True
            else:
                data['items']['has_next_page'] = False

            # Total friend count from the pagelet config blob.
            data['items']['count'] = int(
                re.search(r'AllFriendsAppCollectionPagelet".*?,tab_count:(\d+),', text).group(1))

            ext_data['count'] = data['items']['count']
            data['items']['ext_data'] = ext_data
        else:
            # Sentinel telling callers to keep their previous ext_data.
            data['items']['ext_data'] = "update"

        return data


def get_pagelet_info(res):
    """Return the pagination cursor from a TimelineAppCollection jsmods entry.

    Scans res['jsmods']['require'] for the first list starting with
    'TimelineAppCollection' and returns its [3][2] element (the cursor),
    or None when no entry / cursor exists. As before, a missing
    'jsmods'/'require' key raises KeyError.
    """
    entry = next(
        (x for x in res['jsmods']['require']
         # `and x` guards the empty-list case, which previously raised
         # an uncaught IndexError on x[0].
         if isinstance(x, list) and x and x[0] == 'TimelineAppCollection'),
        None,
    )
    try:
        return entry[3][2]
    except (TypeError, IndexError):
        # No matching entry (entry is None) or entry too short — narrowed
        # from the original bare except.
        return None