作者 lemon

兼容新旧版本fb页面的好友列表采集

@@ -6,7 +6,8 @@ @@ -6,7 +6,8 @@
6 # @Software: PyCharm 6 # @Software: PyCharm
7 7
8 import datetime 8 import datetime
9 - 9 +import demjson
  10 +import base64
10 from enum import Enum 11 from enum import Enum
11 from attr import attrs, attrib 12 from attr import attrs, attrib
12 13
@@ -106,7 +107,7 @@ class Exchange(Enum): @@ -106,7 +107,7 @@ class Exchange(Enum):
106 return None 107 return None
107 108
108 109
109 -def todict(obj,include:list=None): 110 +def todict(obj, include: list = None):
110 keys = dir(obj) 111 keys = dir(obj)
111 res = {} 112 res = {}
112 if include: 113 if include:
@@ -123,3 +124,19 @@ def todict(obj,include:list=None): @@ -123,3 +124,19 @@ def todict(obj,include:list=None):
123 value = getattr(obj, key) 124 value = getattr(obj, key)
124 res[key] = value 125 res[key] = value
125 return res 126 return res
  127 +
  128 +
def tobase64(obj):
    """Encode ``obj`` as a base64 text string.

    ``dict`` and ``list`` values are first serialised with
    ``demjson.encode``; ``str`` values are encoded as UTF-8; ``bytes`` are
    used as they are.

    Args:
        obj: a ``str``, ``list``, ``dict`` or ``bytes`` value.

    Returns:
        The base64 representation as ``str``.

    Raises:
        TypeError: if ``obj`` is none of the supported types.
    """
    if isinstance(obj, (dict, list)):
        obj = demjson.encode(obj)
    if isinstance(obj, str):
        obj = obj.encode('utf-8')
    if not isinstance(obj, bytes):
        # TypeError (an Exception subclass) instead of a bare BaseException,
        # so ordinary ``except Exception`` handlers can see the failure.
        raise TypeError("must str,list,dict,bytes")
    return base64.b64encode(obj).decode('utf-8')
  138 +
  139 +
def frombase64(base):
    """Decode a base64 value and parse the resulting text with ``demjson``.

    Args:
        base: base64 data accepted by ``base64.b64decode``.

    Returns:
        Whatever ``demjson.decode`` produces for the decoded text.
    """
    raw_bytes = base64.b64decode(base)
    json_text = raw_bytes.decode()
    return demjson.decode(json_text)
@@ -17,7 +17,7 @@ import furl @@ -17,7 +17,7 @@ import furl
17 from fbchat import Client, ThreadType, Message, Sticker, FBchatUserError, _exception, log, _util 17 from fbchat import Client, ThreadType, Message, Sticker, FBchatUserError, _exception, log, _util
18 from fbchat._state import State, session_factory, is_home 18 from fbchat._state import State, session_factory, is_home
19 19
20 -from lib import google_map 20 +from lib import google_map, common
21 from lib.common import WorkPlace, College 21 from lib.common import WorkPlace, College
22 from utils import parse_html, _attachment 22 from utils import parse_html, _attachment
23 23
@@ -782,21 +782,39 @@ class FacebookClient(Client): @@ -782,21 +782,39 @@ class FacebookClient(Client):
782 782
783 def friendsList(self, ext_data=None): 783 def friendsList(self, ext_data=None):
784 if ext_data: 784 if ext_data:
785 - data = {'av': self.uid,  
786 - 'fb_api_caller_class': 'RelayModern',  
787 - 'fb_api_req_friendly_name': 'ProfileCometAppCollectionListRendererPaginationQuery',  
788 - 'variables': json.dumps({"count": 20,  
789 - "cursor": ext_data['cursor'],  
790 - "search": None, "scale": 2,  
791 - "privacySelectorRenderLocation": "COMET_PROFILE_COLLECTIONS",  
792 - "id": ext_data['id']}),  
793 - 'doc_id': '2773917206008873'}  
794 - res = self._post('/api/graphql/', data)  
795 - res = res['data']['node'] 785 + ext_data = common.frombase64(ext_data)
  786 +
  787 + if 'collection_token' in ext_data:
  788 + data = {
  789 + 'fb_dtsg_ag': self._state.fb_dtsg_ag,
  790 + 'data': demjson.encode(ext_data),
  791 + }
  792 + res = self._get('/ajax/pagelet/generic.php/AllFriendsAppCollectionPagelet', data)
  793 +
  794 + cursor = parse_html.get_pagelet_info(res)
  795 + ext_data.update({'cursor': cursor})
  796 +
  797 + res = parse_html.get_friend_div(res.get('payload'))
  798 + res['items']['ext_data'] = ext_data
  799 + res['items']['count'] = ext_data['count']
  800 + res['items']['has_next_page'] = not cursor is None
  801 + else:
  802 + data = {'av': self.uid,
  803 + 'fb_api_caller_class': 'RelayModern',
  804 + 'fb_api_req_friendly_name': 'ProfileCometAppCollectionListRendererPaginationQuery',
  805 + 'variables': json.dumps({"count": 20,
  806 + "cursor": ext_data['cursor'],
  807 + "search": None, "scale": 2,
  808 + "privacySelectorRenderLocation": "COMET_PROFILE_COLLECTIONS",
  809 + "id": ext_data['id']}),
  810 + 'doc_id': '2773917206008873'}
  811 + res = self._post('/api/graphql/', data)
  812 + res = res['data']['node']
  813 +
796 else: 814 else:
797 - res = self._state._session.get(self._state.page_url + '/friends') 815 + res = self._state._session.get(self._state.page_url.replace('profile.php?id=', '') + '/friends')
798 if res.status_code == 200 and '/friends' in res.url: 816 if res.status_code == 200 and '/friends' in res.url:
799 - res = parse_html.get_frient_div(res.text) 817 + res = parse_html.get_friend_div(res.text)
800 if not res: 818 if not res:
801 respone = { 819 respone = {
802 'items': [], 820 'items': [],
@@ -820,15 +838,31 @@ class FacebookClient(Client): @@ -820,15 +838,31 @@ class FacebookClient(Client):
820 } 838 }
821 friends.append(data) 839 friends.append(data)
822 840
823 - respone = {  
824 - 'items': friends,  
825 - 'count': count,  
826 - 'ext_data': {  
827 - 'id': res['id'],  
828 - 'cursor': res['items']['page_info']['end_cursor']  
829 - },  
830 - 'has_next_page': res['items']['page_info']['has_next_page']  
831 - } 841 + if 'page_info' in res['items']:
  842 + respone = {
  843 + 'items': friends,
  844 + 'count': count,
  845 + 'ext_data': {
  846 + 'id': res['id'],
  847 + 'cursor': res['items']['page_info']['end_cursor']
  848 + },
  849 + 'has_next_page': res['items']['page_info']['has_next_page']
  850 + }
  851 + elif 'ext_data' in res['items']:
  852 + respone = {
  853 + 'items': friends,
  854 + 'count': count,
  855 + 'ext_data': res['items']['ext_data'],
  856 + 'has_next_page': res['items']['has_next_page'],
  857 + }
  858 + else:
  859 + respone = {
  860 + 'items': [],
  861 + 'count': 0,
  862 + 'ext_data': {},
  863 + 'has_next_page': False
  864 + }
  865 + respone['ext_data'] = common.tobase64(respone['ext_data'])
832 return respone 866 return respone
833 867
834 def changePwd(self, old, new): 868 def changePwd(self, old, new):
@@ -130,22 +130,28 @@ def get_current_city(res): @@ -130,22 +130,28 @@ def get_current_city(res):
def get_user_info(b):
    """Extract ``(id, name, url, image)`` for the viewing user from a parsed page.

    Two page layouts are handled:

    * a ``<script>`` whose text contains a ``viewer_actor:...comment_count``
      section, from which the fields are pulled with regular expressions;
    * otherwise the ``<a>`` element whose ``data-gt`` attribute is
      ``{"chrome_nav_item":"timeline_chrome"}`` and the ``<img>`` inside it.

    Afterwards a best-effort attempt is made to swap the image for a
    ``src`` URL found inside a ``div.hidden_elem`` that carries the same file
    name under a ``/pNNNxNNN/`` path segment with at least three digits per
    dimension.

    Args:
        b: parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        A 4-tuple ``(id, name, url, image)``; all four are ``None`` when the
        fallback layout is used and its anchor, image or id cannot be found.
    """
    # Function-scope import keeps this block self-contained.
    import html

    actor_re = re.compile(r"viewer_actor:(.*?)comment_count", re.MULTILINE | re.DOTALL)
    # ``string=`` matches the keyword already used for the hidden_elem lookup below.
    script = b.find("script", string=actor_re)
    if script:
        info = actor_re.search(script.text).group()
        user_id = re.findall(r'id:"(.*?)"', info)[0]
        name = re.findall(r',name:"(.*?)"', info)[0]
        url = re.findall(r'url:"(.*?)"', info)[0]
        image = re.findall(r'profile_picture_depth_0.*?uri:"(.*?)"', info)[0]
    else:
        anchor = b.find('a', attrs={'data-gt': '{"chrome_nav_item":"timeline_chrome"}'})
        img = anchor.find('img') if anchor is not None else None
        id_match = None
        if img is not None:
            id_match = re.search(r"_header_(\d+)", img.attrs.get('id') or "")
        if id_match is None:
            # Neither layout matched: report "nothing found" instead of
            # failing with AttributeError on a missing element.
            return None, None, None, None
        name = anchor.text
        url = anchor.attrs.get('href')
        image = img.attrs.get('src')
        user_id = id_match.group(1)

    # Best effort only: any failure here keeps the image found above.
    try:
        iname = re.findall(r'/p\d+x\d+/(.*?)\?', image)[0]
        # re.escape: the file name is literal text, not a pattern.
        large_re = re.compile(
            r'src="(https.*?/p\d{3,}x\d{3,}/%s.*?)"' % re.escape(iname),
            re.MULTILINE | re.DOTALL,
        )
        elem = b.find('div', class_='hidden_elem', string=large_re)
        # NOTE(review): the URL is taken from raw markup text, so entities
        # such as "&amp;" are presumably still escaped -- confirm.
        image = html.unescape(large_re.findall(elem.string)[0])
    except (AttributeError, IndexError, TypeError, re.error):
        pass
    return user_id, name, url, image
149 155
150 156
151 def get_all_raw_id(text): 157 def get_all_raw_id(text):
@@ -182,11 +188,79 @@ def get_div_text(html): @@ -182,11 +188,79 @@ def get_div_text(html):
182 return b.text 188 return b.text
183 189
184 190
def _parse_friend_edge(tag):
    """Build one ``edges`` entry from a friend-list tag, or return None if it lacks the expected link, image or id."""
    try:
        link_attrs = tag.find('a').attrs
        img_attrs = tag.find('img').attrs
        fbid = re.findall(r'user.php\?id=(\d+)&', link_attrs.get('data-hovercard', ""))[0]
    except (AttributeError, IndexError, TypeError):
        return None
    return {
        'node': {
            'image': {'uri': img_attrs.get('src')},
            'title': {'text': img_attrs.get('aria-label')},
            'node': {'id': fbid, 'url': link_attrs.get('href')},
        }
    }


def get_friend_div(text):
    """Extract friend-list data from page markup in either of two layouts.

    * If ``text`` contains ``ProfileCometAppSectionFriendsList``, the
      ``collection`` object embedded in the matching ``<script>`` is decoded
      with ``demjson`` and returned as is.
    * If ``text`` contains ``friend_list_item``, a dict of the form
      ``{'items': {'edges': [...], 'count': ..., 'ext_data': ...}}`` is built
      from the markup. When the entries sit inside a ``div.hidden_elem``
      (treated as the first page), ``ext_data``, ``count`` and
      ``has_next_page`` are also read from the page scripts; otherwise the
      ``<li>`` elements are parsed and ``ext_data`` is set to the
      placeholder ``"update"``.

    Args:
        text: markup to inspect.

    Returns:
        The decoded collection, the constructed dict, or ``None`` when
        ``text`` is empty or matches neither layout.
    """
    if not text:
        return None

    if 'ProfileCometAppSectionFriendsList' in text:
        soup = bs4.BeautifulSoup(text, 'html.parser')
        script = soup.find('script', string=re.compile(r'ProfileCometAppSectionFriendsList'))
        collection = re.findall(
            r'TimelineAppCollectionListRenderer",collection:(\{.*?\}\}\})', script.string)[0]
        return demjson.decode(collection)

    if 'friend_list_item' not in text:
        return None

    data = {'items': {'edges': [], 'count': 0}}
    soup = bs4.BeautifulSoup(text, 'html.parser')
    hidden = soup.find('div', class_='hidden_elem',
                       string=re.compile(r'data-testid="friend_list_item"'))
    first_page = hidden is not None
    if first_page:
        # The entries are stored as markup text inside the hidden div, so
        # that text is parsed a second time.
        inner = bs4.BeautifulSoup(hidden.string, 'html.parser')
        tags = inner.find_all('div', {"data-testid": "friend_list_item"})
    else:
        tags = soup.find_all('li')

    for tag in tags:
        edge = _parse_friend_edge(tag)
        # Tags without the expected link/image/id are skipped on purpose.
        if edge is not None:
            data['items']['edges'].append(edge)

    if not first_page:
        data['items']['ext_data'] = "update"
        return data

    script = soup.find('script', string=re.compile('"MedleyPageletRequestData"'))
    page_re = re.compile(r'"MedleyPageletRequestData","set",\[\],\[(.*?)\]')
    ext_data = demjson.decode(page_re.search(script.string).group(1))

    loader_re = re.compile(r'\["TimelineAppCollection","enableContentLoader".*?\]\]')
    loader_script = soup.find('script', string=loader_re)
    if loader_script:
        loader_call = loader_re.search(loader_script.string).group()
        ext_data.update({
            'cursor': re.search(r'"([A-Za-z0-9-_]{50,200})"', loader_call).group(1),
            'collection_token': re.search(
                r'"pagelet_timeline_app_collection_(.*?)"', loader_call).group(1),
        })
        data['items']['has_next_page'] = True
    else:
        data['items']['has_next_page'] = False

    data['items']['count'] = int(
        re.search(r'AllFriendsAppCollectionPagelet".*?,tab_count:(\d+),', text).group(1))

    ext_data['count'] = data['items']['count']
    data['items']['ext_data'] = ext_data
    return data
  254 +
  255 +
def get_pagelet_info(res):
    """Return the pagination cursor carried by a pagelet response.

    Looks through ``res['jsmods']['require']`` for the first list entry whose
    first element is ``'TimelineAppCollection'`` and returns its ``[3][2]``
    element.

    Args:
        res: decoded pagelet response (a dict-like object).

    Returns:
        The cursor value, or ``None`` when the response has no such entry or
        the entry does not have the expected shape.
    """
    try:
        requires = res['jsmods']['require']
    except (KeyError, TypeError):
        return None

    for entry in requires or ():
        # ``entry and`` guards against an empty list before reading entry[0].
        if isinstance(entry, list) and entry and entry[0] == 'TimelineAppCollection':
            try:
                return entry[3][2]
            except (IndexError, KeyError, TypeError):
                return None
    return None