作者 lemon

兼容新旧版本fb页面的好友列表采集

@@ -6,7 +6,8 @@ @@ -6,7 +6,8 @@
6 # @Software: PyCharm 6 # @Software: PyCharm
7 7
8 import datetime 8 import datetime
9 - 9 +import demjson
  10 +import base64
10 from enum import Enum 11 from enum import Enum
11 from attr import attrs, attrib 12 from attr import attrs, attrib
12 13
@@ -106,7 +107,7 @@ class Exchange(Enum): @@ -106,7 +107,7 @@ class Exchange(Enum):
106 return None 107 return None
107 108
108 109
109 -def todict(obj,include:list=None): 110 +def todict(obj, include: list = None):
110 keys = dir(obj) 111 keys = dir(obj)
111 res = {} 112 res = {}
112 if include: 113 if include:
@@ -123,3 +124,19 @@ def todict(obj,include:list=None): @@ -123,3 +124,19 @@ def todict(obj,include:list=None):
123 value = getattr(obj, key) 124 value = getattr(obj, key)
124 res[key] = value 125 res[key] = value
125 return res 126 return res
  127 +
  128 +
def tobase64(obj):
    """Return the Base64 text representation of *obj*.

    Accepted inputs:

    * ``dict`` / ``list`` -- serialised to text with ``demjson.encode`` first.
    * ``str`` -- encoded as UTF-8.
    * ``bytes`` -- used as-is.

    Returns:
        The Base64 encoding of the resulting bytes, as a ``str``.

    Raises:
        TypeError: if *obj* is none of the supported types.  ``TypeError`` is
            still a ``BaseException`` subclass, so handlers written against
            the previous ``raise BaseException(...)`` keep working, while
            ordinary ``except Exception`` handlers can now see the error too.
    """
    if isinstance(obj, (dict, list)):
        obj = demjson.encode(obj)
    if isinstance(obj, str):
        obj = obj.encode('utf-8')
    if isinstance(obj, bytes):
        # b64encode returns bytes; the alphabet is pure ASCII, so decoding
        # it as UTF-8 cannot fail.
        return base64.b64encode(obj).decode('utf-8')
    raise TypeError("must str,list,dict,bytes")
  138 +
  139 +
def frombase64(base):
    """Decode Base64 input *base* and parse the decoded text with demjson.

    The decoded bytes are interpreted using ``bytes.decode()``'s default
    encoding before being handed to ``demjson.decode``; the parsed value is
    returned unchanged.
    """
    raw_bytes = base64.b64decode(base)
    return demjson.decode(raw_bytes.decode())
@@ -17,7 +17,7 @@ import furl @@ -17,7 +17,7 @@ import furl
17 from fbchat import Client, ThreadType, Message, Sticker, FBchatUserError, _exception, log, _util 17 from fbchat import Client, ThreadType, Message, Sticker, FBchatUserError, _exception, log, _util
18 from fbchat._state import State, session_factory, is_home 18 from fbchat._state import State, session_factory, is_home
19 19
20 -from lib import google_map 20 +from lib import google_map, common
21 from lib.common import WorkPlace, College 21 from lib.common import WorkPlace, College
22 from utils import parse_html, _attachment 22 from utils import parse_html, _attachment
23 23
@@ -782,6 +782,23 @@ class FacebookClient(Client): @@ -782,6 +782,23 @@ class FacebookClient(Client):
782 782
783 def friendsList(self, ext_data=None): 783 def friendsList(self, ext_data=None):
784 if ext_data: 784 if ext_data:
  785 + ext_data = common.frombase64(ext_data)
  786 +
  787 + if 'collection_token' in ext_data:
  788 + data = {
  789 + 'fb_dtsg_ag': self._state.fb_dtsg_ag,
  790 + 'data': demjson.encode(ext_data),
  791 + }
  792 + res = self._get('/ajax/pagelet/generic.php/AllFriendsAppCollectionPagelet', data)
  793 +
  794 + cursor = parse_html.get_pagelet_info(res)
  795 + ext_data.update({'cursor': cursor})
  796 +
  797 + res = parse_html.get_friend_div(res.get('payload'))
  798 + res['items']['ext_data'] = ext_data
  799 + res['items']['count'] = ext_data['count']
  800 + res['items']['has_next_page'] = not cursor is None
  801 + else:
785 data = {'av': self.uid, 802 data = {'av': self.uid,
786 'fb_api_caller_class': 'RelayModern', 803 'fb_api_caller_class': 'RelayModern',
787 'fb_api_req_friendly_name': 'ProfileCometAppCollectionListRendererPaginationQuery', 804 'fb_api_req_friendly_name': 'ProfileCometAppCollectionListRendererPaginationQuery',
@@ -793,10 +810,11 @@ class FacebookClient(Client): @@ -793,10 +810,11 @@ class FacebookClient(Client):
793 'doc_id': '2773917206008873'} 810 'doc_id': '2773917206008873'}
794 res = self._post('/api/graphql/', data) 811 res = self._post('/api/graphql/', data)
795 res = res['data']['node'] 812 res = res['data']['node']
  813 +
796 else: 814 else:
797 - res = self._state._session.get(self._state.page_url + '/friends') 815 + res = self._state._session.get(self._state.page_url.replace('profile.php?id=', '') + '/friends')
798 if res.status_code == 200 and '/friends' in res.url: 816 if res.status_code == 200 and '/friends' in res.url:
799 - res = parse_html.get_frient_div(res.text) 817 + res = parse_html.get_friend_div(res.text)
800 if not res: 818 if not res:
801 respone = { 819 respone = {
802 'items': [], 820 'items': [],
@@ -820,6 +838,7 @@ class FacebookClient(Client): @@ -820,6 +838,7 @@ class FacebookClient(Client):
820 } 838 }
821 friends.append(data) 839 friends.append(data)
822 840
  841 + if 'page_info' in res['items']:
823 respone = { 842 respone = {
824 'items': friends, 843 'items': friends,
825 'count': count, 844 'count': count,
@@ -829,6 +848,21 @@ class FacebookClient(Client): @@ -829,6 +848,21 @@ class FacebookClient(Client):
829 }, 848 },
830 'has_next_page': res['items']['page_info']['has_next_page'] 849 'has_next_page': res['items']['page_info']['has_next_page']
831 } 850 }
  851 + elif 'ext_data' in res['items']:
  852 + respone = {
  853 + 'items': friends,
  854 + 'count': count,
  855 + 'ext_data': res['items']['ext_data'],
  856 + 'has_next_page': res['items']['has_next_page'],
  857 + }
  858 + else:
  859 + respone = {
  860 + 'items': [],
  861 + 'count': 0,
  862 + 'ext_data': {},
  863 + 'has_next_page': False
  864 + }
  865 + respone['ext_data'] = common.tobase64(respone['ext_data'])
832 return respone 866 return respone
833 867
834 def changePwd(self, old, new): 868 def changePwd(self, old, new):
@@ -130,7 +130,14 @@ def get_current_city(res): @@ -130,7 +130,14 @@ def get_current_city(res):
130 def get_user_info(b): 130 def get_user_info(b):
131 pattern = re.compile(r"viewer_actor:(.*?)comment_count", re.MULTILINE | re.DOTALL) 131 pattern = re.compile(r"viewer_actor:(.*?)comment_count", re.MULTILINE | re.DOTALL)
132 script = b.find("script", text=pattern) 132 script = b.find("script", text=pattern)
133 - if script: 133 + if not script:
  134 + a_lable = b.find('a', attrs={'data-gt': '{"chrome_nav_item":"timeline_chrome"}'})
  135 + name = a_lable.text
  136 + url = a_lable.attrs.get('href')
  137 + img_data = a_lable.find('img').attrs
  138 + image = img_data.get('src')
  139 + id = re.search("_header_(\d+)", a_lable.find('img').attrs.get('id')).group(1)
  140 + else:
134 info = pattern.search(script.text).group() 141 info = pattern.search(script.text).group()
135 id = re.findall(r'id:"(.*?)"', info)[0] 142 id = re.findall(r'id:"(.*?)"', info)[0]
136 name = re.findall(r',name:"(.*?)"', info)[0] 143 name = re.findall(r',name:"(.*?)"', info)[0]
@@ -145,7 +152,6 @@ def get_user_info(b): @@ -145,7 +152,6 @@ def get_user_info(b):
145 except: 152 except:
146 pass 153 pass
147 return id, name, url, image 154 return id, name, url, image
148 - return None, None, None, None  
149 155
150 156
151 def get_all_raw_id(text): 157 def get_all_raw_id(text):
@@ -182,11 +188,79 @@ def get_div_text(html): @@ -182,11 +188,79 @@ def get_div_text(html):
182 return b.text 188 return b.text
183 189
184 190
def get_friend_div(text):
    """Extract friend-list data from a friends page or an AJAX HTML payload.

    Two layouts are recognised, selected by a marker substring in *text*:

    * ``ProfileCometAppSectionFriendsList`` -- the ``collection`` object
      embedded in the matching ``<script>`` is decoded with ``demjson`` and
      returned as-is.
    * ``friend_list_item`` -- friend entries are scraped from the markup and
      returned as ``{'items': {'edges': [...], 'count': ..., 'ext_data': ...}}``
      (plus ``has_next_page`` on the first page).

    Args:
        text: HTML text to inspect.  ``None`` or an empty string is tolerated.

    Returns:
        The parsed structure described above, or ``None`` when *text* is
        empty, contains neither marker, or the expected script/collection
        cannot be located.
    """
    if not text:
        return None
    if 'ProfileCometAppSectionFriendsList' in text:
        return _parse_comet_friend_collection(text)
    if 'friend_list_item' in text:
        return _parse_legacy_friend_list(text)
    return None


def _parse_comet_friend_collection(text):
    """Decode the ``collection`` object embedded in the friends-list script."""
    soup = bs4.BeautifulSoup(text, 'html.parser')
    script = soup.find('script', string=re.compile(r'ProfileCometAppSectionFriendsList'))
    if script is None or not script.string:
        return None
    found = re.search(r'TimelineAppCollectionListRenderer",collection:(\{.*?\}\}\})', script.string)
    if found is None:
        return None
    return demjson.decode(found.group(1))


def _friend_edge(card):
    """Build one ``{'node': ...}`` edge from a friend element, or None if unusable."""
    anchor = card.find('a')
    img = card.find('img')
    if anchor is None or img is None:
        return None
    # The numeric id is only available inside the hovercard URL; entries
    # without it are skipped, as they were before.
    found = re.search(r'user.php\?id=(\d+)&', anchor.attrs.get('data-hovercard') or "")
    if found is None:
        return None
    return {
        'node': {
            'image': {'uri': img.attrs.get('src')},
            'title': {'text': img.attrs.get('aria-label')},
            'node': {'id': found.group(1), 'url': anchor.attrs.get('href')},
        }
    }


def _parse_legacy_friend_list(text):
    """Scrape friend entries (and first-page paging metadata) from the markup."""
    data = {'items': {'edges': [], 'count': 0}}
    soup = bs4.BeautifulSoup(text, 'html.parser')

    # When a hidden div carries the list markup as a string, this is treated
    # as the first page; otherwise the <li> elements of the document are used.
    hidden = soup.find('div', class_='hidden_elem',
                       string=re.compile(r'data-testid="friend_list_item"'))
    first_page = hidden is not None
    if first_page:
        inner = bs4.BeautifulSoup(hidden.string, 'html.parser')
        cards = inner.find_all('div', {"data-testid": "friend_list_item"})
    else:
        cards = soup.find_all('li')

    for card in cards:
        edge = _friend_edge(card)
        if edge is not None:
            data['items']['edges'].append(edge)

    if not first_page:
        # Placeholder value; NOTE(review): the visible caller overwrites
        # 'ext_data' for follow-up pages -- confirm no other caller relies on it.
        data['items']['ext_data'] = "update"
        return data

    ext_data = {}
    request_script = soup.find('script', string=re.compile('"MedleyPageletRequestData"'))
    if request_script is not None and request_script.string:
        found = re.search(r'"MedleyPageletRequestData","set",\[\],\[(.*?)\]', request_script.string)
        if found is not None:
            ext_data = demjson.decode(found.group(1))

    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (``\[``), which newer Python versions warn about.
    loader = re.compile(r'\["TimelineAppCollection","enableContentLoader".*?\]\]')
    loader_script = soup.find('script', string=loader)
    has_next_page = False
    if loader_script is not None and loader_script.string:
        call = loader.search(loader_script.string)
        if call is not None:
            cursor = re.search(r'"([A-Za-z0-9-_]{50,200})"', call.group())
            token = re.search(r'"pagelet_timeline_app_collection_(.*?)"', call.group())
            if cursor is not None and token is not None:
                ext_data.update({
                    'cursor': cursor.group(1),
                    'collection_token': token.group(1),
                })
                has_next_page = True
    data['items']['has_next_page'] = has_next_page

    total = re.search(r'AllFriendsAppCollectionPagelet".*?,tab_count:(\d+),', text)
    if total is not None:
        data['items']['count'] = int(total.group(1))

    ext_data['count'] = data['items']['count']
    data['items']['ext_data'] = ext_data
    return data
  254 +
  255 +
def get_pagelet_info(res):
    """Return the value at ``[3][2]`` of the first ``TimelineAppCollection`` entry.

    Scans ``res['jsmods']['require']`` for the first list whose first element
    is ``'TimelineAppCollection'`` and returns ``entry[3][2]``.  The visible
    caller stores this value as the paging cursor.

    Args:
        res: mapping with a ``'jsmods'`` mapping holding a ``'require'``
            iterable (a missing key still raises ``KeyError``).

    Returns:
        The extracted value, or ``None`` when no matching entry exists or the
        matching entry does not have the expected nesting.
    """
    for entry in res['jsmods']['require']:
        # ``entry and`` guards against empty lists, which would otherwise
        # raise IndexError on ``entry[0]``.
        if isinstance(entry, list) and entry and entry[0] == 'TimelineAppCollection':
            try:
                return entry[3][2]
            except (IndexError, KeyError, TypeError):
                return None
    return None