parse_html.py 6.0 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020-02-04 14:34
# @Author  : Lemon
# @File    : _parse_html.py
# @Software: PyCharm
import json
import re

import bs4
import demjson
from bs4 import BeautifulSoup


# Extracts the numeric user id from a ``data-hovercard`` attribute value.
_HOVERCARD_USER_ID = re.compile(r'user.php\?id=(\d+)&')

# CSS classes of list-style friend items; the first one present in the HTML wins.
_FRIEND_LIST_CLASSES = ('objectListItem', 'friendBrowserListUnit')


def _image_src(node):
    """Return the ``src`` of the first ``<img>`` below *node*, or ``None``."""
    img = node.find('img') if node is not None else None
    return img.attrs.get('src') if img is not None else None


def _ancestor(node, levels):
    """Climb *levels* parents up from *node*; ``None`` if the tree is too shallow."""
    for _ in range(levels):
        if node is None:
            return None
        node = node.parent
    return node


def _friend_entry(anchor, src):
    """Build one friend record from a profile anchor, or ``None`` if incomplete."""
    if anchor is None:
        return None
    attrs = anchor.attrs
    matched = _HOVERCARD_USER_ID.search(attrs.get('data-hovercard') or '')
    title = attrs.get('title')
    if matched is None or not title:
        return None
    return {'fbid': matched.group(1), 'name': title, 'src': src}


def friend_unit(p: dict):
    """Parse one page of a friend listing.

    The HTML is obtained through ``_unit_html(p)``. Three markups are
    recognised, tried in this order and mutually exclusive: elements with
    class ``objectListItem``, ``friendBrowserListUnit`` or
    ``friendBrowserContent``.

    Items missing a profile anchor, a ``data-hovercard`` user id or a
    ``title`` are skipped instead of aborting the whole page.

    :param p: decoded response dict handed to ``_unit_html``.
    :return: ``(result, next_url)`` where ``result`` is a list of
        ``{'fbid', 'name', 'src'}`` dicts (``src`` may be ``None``) and
        ``next_url`` is the ``ajaxify`` attribute of the
        ``a.uiMorePagerPrimary`` pager link, or ``None`` when there is none.
    """
    html = _unit_html(p)
    bs = BeautifulSoup(html, "html.parser")
    result = []

    list_class = next((c for c in _FRIEND_LIST_CLASSES if c in html), None)
    if list_class is not None:
        for li in bs.select('.' + list_class):
            anchors = li.find_all('a', limit=2)
            # First anchor wraps the avatar, second one carries id and name.
            src = _image_src(anchors[0]) if anchors else None
            entry = _friend_entry(anchors[1] if len(anchors) > 1 else None, src)
            if entry is not None:
                result.append(entry)
    elif 'friendBrowserContent' in html:
        for li in bs.select('.friendBrowserContent'):
            # The avatar is looked up from the fourth ancestor of the content node.
            src = _image_src(_ancestor(li, 4))
            entry = _friend_entry(li.find('a'), src)
            if entry is not None:
                result.append(entry)

    next_url = None
    if 'uiMorePagerPrimary' in html:
        # The marker may occur in the text without a matching <a> element.
        pager = bs.find('a', attrs={'class': 'uiMorePagerPrimary'})
        if pager is not None:
            next_url = pager.get('ajaxify')
    return result, next_url


def _unit_html(p):
    """Return the HTML fragment carried by the response dict *p*.

    The fragment found by ``get_domops_3`` is preferred. When that yields
    nothing, fall back to ``p['payload']['results']['__html']``.

    The fallback is selected with an explicit truthiness test rather than an
    ``assert``, because assertions are removed when Python runs with ``-O``.

    :param p: decoded response dict.
    :return: the HTML string, or ``''`` when neither location provides one.
    """
    html = get_domops_3(p)
    if html:
        return html
    # Tolerate keys that are present but null.
    payload = p.get('payload') or {}
    results = payload.get('results') or {}
    return results.get('__html', '')


def find_input_fields_with_pc(html):
    """Return every ``<input>`` element inside the form whose id is ``login_form``.

    :param html: page markup to parse.
    :return: the ``find_all("input")`` result for the first matching form.
    :raises IndexError: if the page contains no such form.
    """
    soup = bs4.BeautifulSoup(html, "html.parser")
    matching_forms = soup.select("form[id=login_form]")
    return matching_forms[0].find_all("input")


def show_home_page(html):
    """Parse *html* with the ``html.parser`` backend and return the soup object."""
    return bs4.BeautifulSoup(html, "html.parser")


def get_domops_3(res):
    """Return the first non-empty ``'__html'`` value found in ``res['domops']``.

    ``res['domops']`` is expected to be a sequence of sequences; each inner
    sequence is scanned for dict members, and the first dict holding a truthy
    ``'__html'`` entry wins. Dicts without that key are skipped rather than
    ending the search.

    :param res: decoded response dict; any other type yields ``""``.
    :return: the HTML fragment, or ``""`` when nothing suitable is found.
    """
    domops = res.get('domops') if isinstance(res, dict) else None
    if not isinstance(domops, (list, tuple)):
        return ""
    for op in domops:
        if not isinstance(op, (list, tuple)):
            continue
        for part in op:
            if isinstance(part, dict) and part.get('__html'):
                return part['__html']
    return ""


def get_overview_text(res):
    """Collect the text of the ``div.clearfix`` blocks in the response HTML.

    The HTML comes from ``get_domops_3(res)``. The first ``div.clearfix`` is
    skipped, and the phrases "Edit the places you've lived", "Edit your work"
    and "Edit your education" are removed from the remaining blocks.

    :param res: decoded response dict.
    :return: the cleaned block texts joined with newlines; ``""`` when the
        response carries no HTML.
    """
    html = get_domops_3(res)
    if not html:
        return ""
    edit_prompts = re.compile(r"Edit the places you've lived|Edit your work|Edit your education")
    soup = bs4.BeautifulSoup(html, "html.parser")
    blocks = soup.find_all("div", class_="clearfix")
    return "\n".join(edit_prompts.sub("", block.text) for block in blocks[1:])


def get_current_city(res):
    """Extract the current-city name and page id from the response HTML.

    Looks for ``<li id="current_city">`` in the HTML returned by
    ``get_domops_3(res)`` and reads its first link: the link text is the city
    name and the ``page.php?id=<digits>&`` part of its ``data-hovercard``
    attribute is the city id.

    :param res: decoded response dict.
    :return: ``(city_text, city_id)``. Both are ``None`` when the HTML, the
        list item, the link or the hovercard attribute is missing;
        ``city_id`` alone is ``None`` when the hovercard holds no page id.
    """
    html = get_domops_3(res)
    if not html:
        return None, None
    soup = bs4.BeautifulSoup(html, 'html.parser')
    city = soup.find('li', id='current_city')
    link = city.find('a') if city is not None else None
    if link is None:
        return None, None
    hovercard = link.attrs.get('data-hovercard')
    if not isinstance(hovercard, str):
        return None, None
    matched = re.search(r"page.php\?id=(\d+)&", hovercard)
    return link.text, (matched.group(1) if matched else None)


def get_user_info(b):
    """Extract the viewer's id, name, profile URL and avatar URL from a parsed page.

    The data is read from the ``<script>`` whose content contains a
    ``viewer_actor: ... comment_count`` span. As a best-effort extra step, the
    avatar URL is swapped for a larger rendition (``/p<3+ digits>x<3+ digits>/``
    path with the same file name) when one is found in a
    ``div.hidden_elem`` of the page.

    :param b: parsed document exposing a BeautifulSoup-style ``find``.
    :return: ``(id, name, url, image)``; four ``None`` values when the script
        is not present.
    :raises IndexError: if the script is present but lacks one of the fields.
    """
    actor_re = re.compile(r"viewer_actor:(.*?)comment_count", re.MULTILINE | re.DOTALL)
    # ``string=`` is the current spelling of the older ``text=`` keyword and
    # is what the other helpers in this module already use.
    script = b.find("script", string=actor_re)
    if not script:
        return None, None, None, None

    info = actor_re.search(script.text).group()
    user_id = re.findall(r'id:"(.*?)"', info)[0]
    name = re.findall(r',name:"(.*?)"', info)[0]
    url = re.findall(r'url:"(.*?)"', info)[0]
    image = re.findall(r'profile_picture_depth_0.*?uri:"(.*?)"', info)[0]

    try:
        file_name = re.findall(r'/p\d+x\d+/(.*?)\?', image)[0]
        # Escape the file name so dots and other metacharacters match literally.
        hires_re = re.compile(
            r'src="(https.*?/p\d{3,}x\d{3,}/%s.*?)"' % re.escape(file_name),
            re.MULTILINE | re.DOTALL,
        )
        elem = b.find('div', class_='hidden_elem', string=hires_re)
        # NOTE(review): the original replaced '&' with '&', a no-op; it is
        # presumed that decoding the escaped ampersand was intended, since the
        # URL is taken from inside HTML markup.
        image = hires_re.findall(elem.string)[0].replace('&amp;', '&')
    except (IndexError, AttributeError, TypeError):
        # No larger rendition available: keep the URL found in the script.
        pass
    return user_id, name, url, image


def get_all_raw_id(text):
    """Collect the numeric ``raw_id`` values embedded in *text*.

    Two encodings are handled:

    * ``raw_id\\\\":<digits>,`` (two backslashes before the quote): ids are read
      only from ``div.hidden_elem`` elements and de-duplicated, keeping
      first-seen order.
    * otherwise ``\\"raw_id\\":<digits>,`` is matched against the whole text and
      every occurrence is returned, duplicates included.

    The document is parsed only in the first case; the second one is a plain
    regex scan and needs no tree.

    :param text: raw page or response text.
    :return: list of id strings.
    """
    if 'raw_id\\\\":' in text:
        pattern = re.compile(r'raw_id\\\\":(\d+),')
        soup = bs4.BeautifulSoup(text, 'html.parser')
        found = []
        for elem in soup.find_all('div', class_='hidden_elem', string=pattern):
            found.extend(pattern.findall(elem.string))
        # dict.fromkeys de-duplicates while keeping a stable order,
        # unlike a round-trip through set().
        return list(dict.fromkeys(found))
    return re.findall(r'\\"raw_id\\":(\d+),', text)


def get_hidden_input(res):
    """Map ``name`` to ``value`` for the hidden ``<input>`` fields in the response HTML.

    The HTML comes from ``get_domops_3(res)``. Inputs lacking either attribute
    are ignored; when a name repeats, the last value wins.

    :param res: decoded response dict.
    :return: dict of field name to field value.
    """
    soup = bs4.BeautifulSoup(get_domops_3(res), 'html.parser')
    hidden_attrs = (tag.attrs for tag in soup.find_all('input', type='hidden'))
    return {
        attrs['name']: attrs['value']
        for attrs in hidden_attrs
        if 'name' in attrs and 'value' in attrs
    }


def get_div_text(html):
    """Return the ``.text`` of *html* after parsing it with ``html.parser``."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    plain_text = soup.text
    return plain_text


def get_frient_div(text):
    """Decode the friends-list collection object embedded in a page script.

    Finds the ``<script>`` mentioning ``ProfileCometAppSectionFriendsList``,
    cuts out the object literal that follows
    ``TimelineAppCollectionListRenderer",collection:`` and decodes it with
    ``demjson``.

    :param text: raw page markup.
    :return: the decoded collection, or ``None`` when the marker, the script
        or the collection literal cannot be found.
    """
    marker = 'ProfileCometAppSectionFriendsList'
    if marker not in text:
        return None
    b = bs4.BeautifulSoup(text, 'html.parser')
    script = b.find('script', string=re.compile(marker))
    # The marker can appear in the page without a matching script element.
    if script is None or not script.string:
        return None
    found = re.search(
        r'TimelineAppCollectionListRenderer",collection:(\{.*?\}\}\})',
        script.string,
    )
    if found is None:
        return None
    return demjson.decode(found.group(1))