作者 lemon

兼容新旧版本fb页面的好友列表采集

... ... @@ -6,7 +6,8 @@
# @Software: PyCharm
import datetime
import demjson
import base64
from enum import Enum
from attr import attrs, attrib
... ... @@ -106,7 +107,7 @@ class Exchange(Enum):
return None
def todict(obj,include:list=None):
def todict(obj, include: list = None):
keys = dir(obj)
res = {}
if include:
... ... @@ -123,3 +124,19 @@ def todict(obj,include:list=None):
value = getattr(obj, key)
res[key] = value
return res
def tobase64(obj):
    """Return *obj* as a Base64-encoded ``str``.

    ``dict`` and ``list`` values are first serialized with
    ``demjson.encode``; ``str`` values are encoded as UTF-8; ``bytes``
    values are used as they are.

    Args:
        obj: a ``dict``, ``list``, ``str`` or ``bytes`` value.

    Returns:
        The Base64 text of the (serialized) value.

    Raises:
        TypeError: if *obj* is none of the supported types.
    """
    if isinstance(obj, (dict, list)):
        obj = demjson.encode(obj)
    if isinstance(obj, str):
        obj = obj.encode('utf-8')
    if not isinstance(obj, bytes):
        # TypeError instead of a bare BaseException: ordinary
        # `except Exception` handlers can see it, and handlers that
        # catch BaseException keep working.
        raise TypeError("must str,list,dict,bytes")
    return base64.b64encode(obj).decode('utf-8')
def frombase64(base):
    """Decode Base64 input and parse the resulting text with ``demjson``.

    Args:
        base: Base64-encoded data accepted by ``base64.b64decode``.

    Returns:
        Whatever ``demjson.decode`` produces for the decoded text.
    """
    raw_bytes = base64.b64decode(base)
    # bytes.decode() with no argument uses UTF-8.
    text = raw_bytes.decode()
    return demjson.decode(text)
... ...
... ... @@ -17,7 +17,7 @@ import furl
from fbchat import Client, ThreadType, Message, Sticker, FBchatUserError, _exception, log, _util
from fbchat._state import State, session_factory, is_home
from lib import google_map
from lib import google_map, common
from lib.common import WorkPlace, College
from utils import parse_html, _attachment
... ... @@ -782,21 +782,39 @@ class FacebookClient(Client):
def friendsList(self, ext_data=None):
if ext_data:
data = {'av': self.uid,
'fb_api_caller_class': 'RelayModern',
'fb_api_req_friendly_name': 'ProfileCometAppCollectionListRendererPaginationQuery',
'variables': json.dumps({"count": 20,
"cursor": ext_data['cursor'],
"search": None, "scale": 2,
"privacySelectorRenderLocation": "COMET_PROFILE_COLLECTIONS",
"id": ext_data['id']}),
'doc_id': '2773917206008873'}
res = self._post('/api/graphql/', data)
res = res['data']['node']
ext_data = common.frombase64(ext_data)
if 'collection_token' in ext_data:
data = {
'fb_dtsg_ag': self._state.fb_dtsg_ag,
'data': demjson.encode(ext_data),
}
res = self._get('/ajax/pagelet/generic.php/AllFriendsAppCollectionPagelet', data)
cursor = parse_html.get_pagelet_info(res)
ext_data.update({'cursor': cursor})
res = parse_html.get_friend_div(res.get('payload'))
res['items']['ext_data'] = ext_data
res['items']['count'] = ext_data['count']
res['items']['has_next_page'] = not cursor is None
else:
data = {'av': self.uid,
'fb_api_caller_class': 'RelayModern',
'fb_api_req_friendly_name': 'ProfileCometAppCollectionListRendererPaginationQuery',
'variables': json.dumps({"count": 20,
"cursor": ext_data['cursor'],
"search": None, "scale": 2,
"privacySelectorRenderLocation": "COMET_PROFILE_COLLECTIONS",
"id": ext_data['id']}),
'doc_id': '2773917206008873'}
res = self._post('/api/graphql/', data)
res = res['data']['node']
else:
res = self._state._session.get(self._state.page_url + '/friends')
res = self._state._session.get(self._state.page_url.replace('profile.php?id=', '') + '/friends')
if res.status_code == 200 and '/friends' in res.url:
res = parse_html.get_frient_div(res.text)
res = parse_html.get_friend_div(res.text)
if not res:
respone = {
'items': [],
... ... @@ -820,15 +838,31 @@ class FacebookClient(Client):
}
friends.append(data)
respone = {
'items': friends,
'count': count,
'ext_data': {
'id': res['id'],
'cursor': res['items']['page_info']['end_cursor']
},
'has_next_page': res['items']['page_info']['has_next_page']
}
if 'page_info' in res['items']:
respone = {
'items': friends,
'count': count,
'ext_data': {
'id': res['id'],
'cursor': res['items']['page_info']['end_cursor']
},
'has_next_page': res['items']['page_info']['has_next_page']
}
elif 'ext_data' in res['items']:
respone = {
'items': friends,
'count': count,
'ext_data': res['items']['ext_data'],
'has_next_page': res['items']['has_next_page'],
}
else:
respone = {
'items': [],
'count': 0,
'ext_data': {},
'has_next_page': False
}
respone['ext_data'] = common.tobase64(respone['ext_data'])
return respone
def changePwd(self, old, new):
... ...
... ... @@ -130,22 +130,28 @@ def get_current_city(res):
def get_user_info(b):
pattern = re.compile(r"viewer_actor:(.*?)comment_count", re.MULTILINE | re.DOTALL)
script = b.find("script", text=pattern)
if script:
if not script:
a_lable = b.find('a', attrs={'data-gt': '{"chrome_nav_item":"timeline_chrome"}'})
name = a_lable.text
url = a_lable.attrs.get('href')
img_data = a_lable.find('img').attrs
image = img_data.get('src')
id = re.search("_header_(\d+)", a_lable.find('img').attrs.get('id')).group(1)
else:
info = pattern.search(script.text).group()
id = re.findall(r'id:"(.*?)"', info)[0]
name = re.findall(r',name:"(.*?)"', info)[0]
url = re.findall(r'url:"(.*?)"', info)[0]
image = re.findall(r'profile_picture_depth_0.*?uri:"(.*?)"', info)[0]
try:
r = re.compile('/p\d+x\d+/(.*?)\?')
iname = r.findall(image)[0]
pattern = re.compile('src="(https.*?/p\d{3,}x\d{3,}/%s.*?)"' % (iname), re.MULTILINE | re.DOTALL)
elem = b.find('div', class_='hidden_elem', string=pattern)
image = re.sub('&', lambda x: "&", pattern.findall(elem.string)[0])
except:
pass
return id, name, url, image
return None, None, None, None
try:
r = re.compile('/p\d+x\d+/(.*?)\?')
iname = r.findall(image)[0]
pattern = re.compile('src="(https.*?/p\d{3,}x\d{3,}/%s.*?)"' % (iname), re.MULTILINE | re.DOTALL)
elem = b.find('div', class_='hidden_elem', string=pattern)
image = re.sub('&', lambda x: "&", pattern.findall(elem.string)[0])
except:
pass
return id, name, url, image
def get_all_raw_id(text):
... ... @@ -182,11 +188,79 @@ def get_div_text(html):
return b.text
def get_frient_div(text):
    """Extract the friends-list collection object embedded in a page.

    Finds the ``<script>`` whose text mentions
    ``ProfileCometAppSectionFriendsList`` and decodes, with ``demjson``,
    the object literal that follows
    ``TimelineAppCollectionListRenderer",collection:``.

    Args:
        text: HTML source of the page.

    Returns:
        The decoded collection, or ``None`` when the marker, the script
        tag or the collection literal cannot be found.
    """
    marker = 'ProfileCometAppSectionFriendsList'
    if marker not in text:
        return None
    soup = bs4.BeautifulSoup(text, 'html.parser')
    script = soup.find('script', string=re.compile(marker))
    # The marker may occur outside of a <script> body; treat that like
    # "not found" instead of failing on `None.string`.
    if script is None or not script.string:
        return None
    found = re.search(
        r'TimelineAppCollectionListRenderer",collection:(\{.*?\}\}\})',
        script.string)
    if found is None:
        return None
    return demjson.decode(found.group(1))
def _parse_comet_friend_collection(text):
    """Decode the collection literal of a Comet friends page, or return None."""
    soup = bs4.BeautifulSoup(text, 'html.parser')
    script = soup.find('script', string=re.compile(r'ProfileCometAppSectionFriendsList'))
    if script is None or not script.string:
        return None
    found = re.search(
        r'TimelineAppCollectionListRenderer",collection:(\{.*?\}\}\})',
        script.string)
    if found is None:
        return None
    return demjson.decode(found.group(1))


def _parse_legacy_friend_edge(tag):
    """Build one ``{'node': {...}}`` edge from a friend-list element, or return None."""
    anchor = tag.find('a')
    img = tag.find('img')
    if anchor is None or img is None:
        return None
    ids = re.findall(r'user.php\?id=(\d+)&', anchor.attrs.get('data-hovercard', ""))
    if not ids:
        # No user id in the hovercard attribute: not a usable friend entry.
        return None
    return {
        'node': {
            'image': {'uri': img.attrs.get('src')},
            'title': {'text': img.attrs.get('aria-label')},
            'node': {'id': ids[0], 'url': anchor.attrs.get('href')},
        }
    }


def get_friend_div(text):
    """Parse a friends page in either of the two supported layouts.

    * If *text* contains ``ProfileCometAppSectionFriendsList``, the
      embedded collection literal is decoded with ``demjson`` and
      returned (``None`` if the script or the literal is missing).
    * Else, if *text* contains ``friend_list_item``, the friend elements
      are scraped from the HTML and a dict of the form
      ``{'items': {'edges': [...], 'count': int, 'ext_data': ...}}`` is
      returned. When the items are found inside a ``hidden_elem`` div,
      ``has_next_page``, the real ``count`` and the paging state
      (``ext_data``) are filled in from the page scripts; otherwise
      ``ext_data`` is the placeholder string ``"update"``.
    * Otherwise ``None`` is returned.

    Args:
        text: HTML source (or HTML payload) to parse.

    Raises:
        AttributeError: if the ``hidden_elem`` layout is detected but one
            of the expected script patterns does not match.
    """
    if 'ProfileCometAppSectionFriendsList' in text:
        return _parse_comet_friend_collection(text)
    if 'friend_list_item' not in text:
        return None

    soup = bs4.BeautifulSoup(text, 'html.parser')
    hidden = soup.find('div', class_='hidden_elem',
                       string=re.compile(r'data-testid="friend_list_item"'))
    first_page = hidden is not None
    if first_page:
        # The friend markup is stored as text inside the hidden div, so
        # it has to be parsed a second time.
        inner = bs4.BeautifulSoup(hidden.string, 'html.parser')
        candidates = inner.find_all('div', {"data-testid": "friend_list_item"})
    else:
        candidates = soup.find_all('li')

    # Elements that lack a link, an image or a user id are skipped.
    edges = [edge
             for edge in map(_parse_legacy_friend_edge, candidates)
             if edge is not None]
    items = {'edges': edges, 'count': 0}
    data = {'items': items}

    if not first_page:
        # Placeholder value; presumably replaced by the caller - confirm.
        items['ext_data'] = "update"
        return data

    # The lookups below are deliberately left unguarded: there is no
    # sensible fallback for the request state, so a layout change still
    # surfaces as an AttributeError, exactly as before.
    script = soup.find('script', string=re.compile('"MedleyPageletRequestData"'))
    request_data = re.search(r'"MedleyPageletRequestData","set",\[\],\[(.*?)\]',
                             script.string)
    ext_data = demjson.decode(request_data.group(1))

    loader_re = re.compile(r'\["TimelineAppCollection","enableContentLoader".*?\]\]')
    loader_script = soup.find('script', string=loader_re)
    if loader_script is not None:
        loader_call = loader_re.search(loader_script.string).group()
        ext_data.update({
            'cursor': re.search(r'"([A-Za-z0-9-_]{50,200})"', loader_call).group(1),
            'collection_token': re.search(
                r'"pagelet_timeline_app_collection_(.*?)"', loader_call).group(1),
        })
    # A content-loader call on the page is what signals a further page.
    items['has_next_page'] = loader_script is not None

    items['count'] = int(
        re.search(r'AllFriendsAppCollectionPagelet".*?,tab_count:(\d+),', text).group(1))
    ext_data['count'] = items['count']
    items['ext_data'] = ext_data
    return data
def get_pagelet_info(res):
    """Return the cursor stored in a pagelet response, if any.

    Scans ``res['jsmods']['require']`` for the first list entry whose
    first element is ``'TimelineAppCollection'`` and returns the value at
    position ``[3][2]`` of that entry.

    Args:
        res: decoded pagelet response; must contain
            ``res['jsmods']['require']`` as an iterable.

    Returns:
        The cursor value, or ``None`` when there is no matching entry or
        the entry does not have the expected shape.

    Raises:
        KeyError: if ``'jsmods'`` or ``'require'`` is missing from *res*.
    """
    entry = next(
        (candidate for candidate in res['jsmods']['require']
         # `candidate and ...` skips empty lists, which would otherwise
         # raise IndexError on candidate[0].
         if isinstance(candidate, list) and candidate
         and candidate[0] == 'TimelineAppCollection'),
        None)
    if entry is None:
        return None
    try:
        return entry[3][2]
    except (IndexError, KeyError, TypeError):
        # Entry is too short or its fourth element is not indexable.
        return None
... ...