From 344ebc8530ce51a5f0193b92604ab9281edc3dbf Mon Sep 17 00:00:00 2001 From: lemon <961222258@qq.com> Date: Fri, 21 Feb 2020 13:27:05 +0800 Subject: [PATCH] 兼容新旧版本fb页面的好友列表采集 --- lib/common.py | 21 +++++++++++++++++++-- lib/facebook.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------- utils/parse_html.py | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------- 3 files changed, 169 insertions(+), 44 deletions(-) diff --git a/lib/common.py b/lib/common.py index d7cff66..25d7089 100644 --- a/lib/common.py +++ b/lib/common.py @@ -6,7 +6,8 @@ # @Software: PyCharm import datetime - +import demjson +import base64 from enum import Enum from attr import attrs, attrib @@ -106,7 +107,7 @@ class Exchange(Enum): return None -def todict(obj,include:list=None): +def todict(obj, include: list = None): keys = dir(obj) res = {} if include: @@ -123,3 +124,19 @@ def todict(obj,include:list=None): value = getattr(obj, key) res[key] = value return res + + +def tobase64(obj): + if isinstance(obj, (dict, list)): + obj = demjson.encode(obj) + if isinstance(obj, str): + obj = obj.encode('utf-8') + if isinstance(obj, bytes): + bin = base64.b64encode(obj) + return str(bin, 'utf-8') + raise BaseException("must str,list,dict,bytes") + + +def frombase64(base): + string = base64.b64decode(base).decode() + return demjson.decode(string) diff --git a/lib/facebook.py b/lib/facebook.py index 8567bbb..d3e5276 100644 --- a/lib/facebook.py +++ b/lib/facebook.py @@ -17,7 +17,7 @@ import furl from fbchat import Client, ThreadType, Message, Sticker, FBchatUserError, _exception, log, _util from fbchat._state import State, session_factory, is_home -from lib import google_map +from lib import google_map, common from lib.common import WorkPlace, College from utils import parse_html, _attachment @@ -782,21 +782,39 @@ class FacebookClient(Client): def friendsList(self, ext_data=None): if ext_data: - data = {'av': self.uid, - 'fb_api_caller_class': 'RelayModern', - 'fb_api_req_friendly_name': 'ProfileCometAppCollectionListRendererPaginationQuery', - 'variables': json.dumps({"count": 20, - "cursor": ext_data['cursor'], - "search": None, "scale": 2, - "privacySelectorRenderLocation": "COMET_PROFILE_COLLECTIONS", - "id": ext_data['id']}), - 'doc_id': '2773917206008873'} - res = self._post('/api/graphql/', data) - res = res['data']['node'] + ext_data = common.frombase64(ext_data) + + if 'collection_token' in ext_data: + data = { + 'fb_dtsg_ag': self._state.fb_dtsg_ag, + 'data': demjson.encode(ext_data), + } + res = self._get('/ajax/pagelet/generic.php/AllFriendsAppCollectionPagelet', data) + + cursor = parse_html.get_pagelet_info(res) + ext_data.update({'cursor': cursor}) + + res = parse_html.get_friend_div(res.get('payload')) + res['items']['ext_data'] = ext_data + res['items']['count'] = ext_data['count'] + res['items']['has_next_page'] = not cursor is None + else: + data = {'av': self.uid, + 'fb_api_caller_class': 'RelayModern', + 'fb_api_req_friendly_name': 'ProfileCometAppCollectionListRendererPaginationQuery', + 'variables': json.dumps({"count": 20, + "cursor": ext_data['cursor'], + "search": None, "scale": 2, + "privacySelectorRenderLocation": "COMET_PROFILE_COLLECTIONS", + "id": ext_data['id']}), + 'doc_id': '2773917206008873'} + res = self._post('/api/graphql/', data) + res = res['data']['node'] + else: - res = self._state._session.get(self._state.page_url + '/friends') + res = self._state._session.get(self._state.page_url.replace('profile.php?id=', '') + '/friends') if res.status_code == 200 and '/friends' in res.url: - res = parse_html.get_frient_div(res.text) + res = parse_html.get_friend_div(res.text) if not res: respone = { 'items': [], @@ -820,15 +838,31 @@ class FacebookClient(Client): } friends.append(data) - respone = { - 'items': friends, - 'count': count, - 'ext_data': { - 'id': res['id'], - 'cursor': res['items']['page_info']['end_cursor'] - }, - 'has_next_page': res['items']['page_info']['has_next_page'] - } + if 'page_info' in res['items']: + respone = { + 'items': friends, + 'count': count, + 'ext_data': { + 'id': res['id'], + 'cursor': res['items']['page_info']['end_cursor'] + }, + 'has_next_page': res['items']['page_info']['has_next_page'] + } + elif 'ext_data' in res['items']: + respone = { + 'items': friends, + 'count': count, + 'ext_data': res['items']['ext_data'], + 'has_next_page': res['items']['has_next_page'], + } + else: + respone = { + 'items': [], + 'count': 0, + 'ext_data': {}, + 'has_next_page': False + } + respone['ext_data'] = common.tobase64(respone['ext_data']) return respone def changePwd(self, old, new): diff --git a/utils/parse_html.py b/utils/parse_html.py index 07deb63..af6e8cb 100644 --- a/utils/parse_html.py +++ b/utils/parse_html.py @@ -130,22 +130,28 @@ def get_current_city(res): def get_user_info(b): pattern = re.compile(r"viewer_actor:(.*?)comment_count", re.MULTILINE | re.DOTALL) script = b.find("script", text=pattern) - if script: + if not script: + a_lable = b.find('a', attrs={'data-gt': '{"chrome_nav_item":"timeline_chrome"}'}) + name = a_lable.text + url = a_lable.attrs.get('href') + img_data = a_lable.find('img').attrs + image = img_data.get('src') + id = re.search("_header_(\d+)", a_lable.find('img').attrs.get('id')).group(1) + else: info = pattern.search(script.text).group() id = re.findall(r'id:"(.*?)"', info)[0] name = re.findall(r',name:"(.*?)"', info)[0] url = re.findall(r'url:"(.*?)"', info)[0] image = re.findall(r'profile_picture_depth_0.*?uri:"(.*?)"', info)[0] - try: - r = re.compile('/p\d+x\d+/(.*?)\?') - iname = r.findall(image)[0] - pattern = re.compile('src="(https.*?/p\d{3,}x\d{3,}/%s.*?)"' % (iname), re.MULTILINE | re.DOTALL) - elem = b.find('div', class_='hidden_elem', string=pattern) - image = re.sub('&', lambda x: "&", pattern.findall(elem.string)[0]) - except: - pass - return id, name, url, image - return None, None, None, None + try: + r = re.compile('/p\d+x\d+/(.*?)\?') + iname = r.findall(image)[0] + pattern = re.compile('src="(https.*?/p\d{3,}x\d{3,}/%s.*?)"' % (iname), re.MULTILINE | re.DOTALL) + elem = b.find('div', class_='hidden_elem', string=pattern) + image = re.sub('&', lambda x: "&", pattern.findall(elem.string)[0]) + except: + pass + return id, name, url, image def get_all_raw_id(text): @@ -182,11 +188,79 @@ def get_div_text(html): return b.text -def get_frient_div(text): - if not 'ProfileCometAppSectionFriendsList' in text: - return None - b = bs4.BeautifulSoup(text, 'html.parser') - script = b.find('script', string=re.compile(r'ProfileCometAppSectionFriendsList')) - a = re.findall(r'TimelineAppCollectionListRenderer",collection:(\{.*?\}\}\})', script.string)[0] - res = demjson.decode(a) - return res +def get_friend_div(text): + if 'ProfileCometAppSectionFriendsList' in text: + b = bs4.BeautifulSoup(text, 'html.parser') + script = b.find('script', string=re.compile(r'ProfileCometAppSectionFriendsList')) + a = re.findall(r'TimelineAppCollectionListRenderer",collection:(\{.*?\}\}\})', script.string)[0] + res = demjson.decode(a) + return res + elif 'friend_list_item' in text: + data = {'items': {'edges': [], 'count': 0}} + b = bs4.BeautifulSoup(text, 'html.parser') + elem = b.find('div', class_='hidden_elem', string=re.compile(r'data-testid="friend_list_item"')) + if elem: + elem = bs4.BeautifulSoup(elem.string, 'html.parser') + divs = elem.find_all('div', {"data-testid": "friend_list_item"}) + first_page = True + else: + first_page = False + divs = b.find_all('li') + + for dd in divs: + node = dict() + item = dict() + try: + a_data = dd.find('a').attrs + img_data = dd.find('img').attrs + + url = a_data.get('href') + fbid = re.findall(r'user.php\?id=(\d+)&', a_data.get('data-hovercard', ""))[0] + image = img_data.get('src') + name = img_data.get('aria-label') + item['image'] = {'uri': image} + item['title'] = {'text': name} + item['node'] = {'id': fbid, 'url': url} + node['node'] = item + data['items']['edges'].append(node) + except: + pass + if first_page: + script = b.find('script', string=re.compile('"MedleyPageletRequestData"')) + page = re.compile(r'"MedleyPageletRequestData","set",\[\],\[(.*?)\]') + ext_data = demjson.decode(page.search(script.string).group(1)) + + load = re.compile('\["TimelineAppCollection","enableContentLoader".*?\]\]') + script_126 = b.find('script', string=load) + if script_126: + tttt = load.search(script_126.string).group() + ext_data.update({ + 'cursor': re.search(r'"([A-Za-z0-9-_]{50,200})"', tttt).group(1), + 'collection_token': re.search(r'"pagelet_timeline_app_collection_(.*?)"', tttt).group(1), + }) + data['items']['has_next_page'] = True + else: + data['items']['has_next_page'] = False + + data['items']['count'] = int( + re.search(r'AllFriendsAppCollectionPagelet".*?,tab_count:(\d+),', text).group(1)) + + ext_data['count'] = data['items']['count'] + data['items']['ext_data'] = ext_data + else: + data['items']['ext_data'] = "update" + + return data + + +def get_pagelet_info(res): + y = {} + for x in res['jsmods']['require']: + if isinstance(x, list) and x[0] == 'TimelineAppCollection': + y = x + break + try: + cursor = y[3][2] + except: + cursor = None + return cursor -- libgit2 0.24.0