From 1246ecf467916ea83b9d045f106c66dc8b78e56a Mon Sep 17 00:00:00 2001 From: educgf004 <111123123a11@qq.com> Date: Wed, 10 Jul 2024 10:12:38 +0800 Subject: [PATCH 1/2] ADD file via upload --- 1.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 1.cpp diff --git a/1.cpp b/1.cpp new file mode 100644 index 0000000..c7f6de5 --- /dev/null +++ b/1.cpp @@ -0,0 +1,48 @@ +#include + +// 函数原型声明 +void bubbleSort(int arr[], int n); + +int main() { + // 固定数组大小为3 + int n = 3; + int arr[3]; + + // 获取用户输入数组元素 + for (int i = 0; i < n; i++) { + scanf("%d", &arr[i]); + } + + // 输出排序前的数组 + printf("输出排序前的数组: "); + for (int i = 0; i < n; i++) { + printf("%d ", arr[i]); + } + printf("\n"); + + // 调用冒泡排序函数 + bubbleSort(arr, n); + + // 输出排序后的数组 + printf("输出排序后的数组: "); + for (int i = 0; i < n; i++) { + printf("%d ", arr[i]); + } + printf("\n"); + + return 0; +} + +// 冒泡排序函数 +void bubbleSort(int arr[], int n) { + for (int i = 0; i < n - 1; i++) { + for (int j = 0; j < n - i - 1; j++) { + // 如果相邻的元素逆序,则交换它们 + if (arr[j] > arr[j + 1]) { + int temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } + } + } +} -- 2.34.1 From c5e4cce03fa4782fe9e42a4669ed8c85a9b2d4cd Mon Sep 17 00:00:00 2001 From: educgf004 <111123123a11@qq.com> Date: Wed, 10 Jul 2024 10:12:57 +0800 Subject: [PATCH 2/2] ADD file via upload --- main.py | 381 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 381 insertions(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 0000000..65f670c --- /dev/null +++ b/main.py @@ -0,0 +1,381 @@ +import re +import time +from datetime import datetime +import httpx +import asyncio +import os +import json +from user_info import User_info +from csv_gen import csv_gen +from cache_gen import cache_gen + +max_concurrent_requests = 8 #最大并发数量,默认为8,对自己网络有自信的可以调高; 遇到多次下载失败时适当降低 + +def del_special_char(string): + string = re.sub(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3040-\u31FF\.]', '', string) + return string + +def stamp2time(msecs_stamp:int) -> str: + timeArray = time.localtime(msecs_stamp/1000) + otherStyleTime = time.strftime("%Y-%m-%d %H-%M", timeArray) + return otherStyleTime + +def time2stamp(timestr:str) -> int: + datetime_obj = datetime.strptime(timestr, "%Y-%m-%d") + msecs_stamp = int(time.mktime(datetime_obj.timetuple()) * 1000.0 + datetime_obj.microsecond / 1000.0) + return msecs_stamp + +def time_comparison(now, start, end): + start_label = True + start_down = False + #twitter : latest -> old + if now >= start and now <= end: #符合时间条件,下载 + start_down = True + elif now < start: #超出时间范围,结束 + start_label = False + return [start_down, start_label] + + +#读取配置 +log_output = False +has_retweet = False +has_highlights = False +has_likes = False +has_video = False +csv_file = None +cache_data = None +down_log = False +async_down = True +autoSync = False + +start_time_stamp = 655028357000 #1990-10-04 +end_time_stamp = 2548484357000 #2050-10-04 +start_label = True +First_Page = True #首页提取内容时特殊处理 + +with open('settings.json', 'r', encoding='utf8') as f: + settings = json.load(f) + if not settings['save_path']: + settings['save_path'] = os.getcwd() + settings['save_path'] += os.sep + if settings['has_retweet']: + has_retweet = True + if settings['high_lights']: + has_highlights = True + has_retweet = False + if settings['time_range']: + time_range = True + start_time,end_time = settings['time_range'].split(':') + start_time_stamp,end_time_stamp = time2stamp(start_time),time2stamp(end_time) + if settings['autoSync']: + autoSync = True + if settings['down_log']: + down_log = True + if settings['likes']: #likes的逻辑和retweet大致相同 + has_retweet = True + has_likes = True + has_highlights = False + start_time_stamp = 655028357000 #1990-10-04 + end_time_stamp = 2548484357000 #2050-10-04 + if settings['has_video']: + has_video = True + if settings['log_output']: + log_output = True + if not settings['async_down']: + async_down = False +###### proxy ###### + if settings['proxy']: + proxies = settings['proxy'] + else: + proxies = None +############ + img_format = settings['img_format'] + f.close() + +_headers = { + 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', + 'authorization':'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', +} +_headers['cookie'] = settings['cookie'] + +request_count = 0 #请求次数计数 +down_count = 0 #下载图片数计数 + +def get_other_info(_user_info): + url = 'https://twitter.com/i/api/graphql/xc8f1g7BYqr6VTzTbvNlGw/UserByScreenName?variables={"screen_name":"' + _user_info.screen_name + '","withSafetyModeUserFields":false}&features={"hidden_profile_likes_enabled":false,"hidden_profile_subscriptions_enabled":false,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"subscriptions_verification_info_verified_since_enabled":true,"highlights_tweets_tab_ui_enabled":true,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true}&fieldToggles={"withAuxiliaryUserLabels":false}' + try: + global request_count + response = httpx.get(url, headers=_headers, proxies=proxies).text + request_count += 1 + raw_data = json.loads(response) + _user_info.rest_id = raw_data['data']['user']['result']['rest_id'] + _user_info.name = raw_data['data']['user']['result']['legacy']['name'] + _user_info.statuses_count = raw_data['data']['user']['result']['legacy']['statuses_count'] + _user_info.media_count = raw_data['data']['user']['result']['legacy']['media_count'] + except Exception: + print('获取信息失败') + print(response) + return False + return True + +def print_info(_user_info): + print( + f''' + <======基本信息=====> + 昵称:{_user_info.name} + 用户名:{_user_info.screen_name} + 数字ID:{_user_info.rest_id} + 总推数(含转推):{_user_info.statuses_count} + 含图片/视频/音频推数(不含转推):{_user_info.media_count} + <==================> + 开始爬取... + ''' + ) + +def get_download_url(_user_info): + + def get_heighest_video_quality(variants) -> str: #找到最高质量的视频地址,并返回 + + if len(variants) == 1: #gif适配 + return variants[0]['url'] + + max_bitrate = 0 + heighest_url = None + for i in variants: + if 'bitrate' in i: + if int(i['bitrate']) > max_bitrate: + max_bitrate = int(i['bitrate']) + heighest_url = i['url'] + return heighest_url + + + def get_url_from_content(content): + global start_label + _photo_lst = [] + if has_retweet or has_highlights: + x_label = 'content' + else: + x_label = 'item' + for i in content: + try: + if 'promoted-tweet' in i['entryId']: #排除广告 + continue + if 'tweet' in i['entryId']: #正常推文 + if 'tweet' in i[x_label]['itemContent']['tweet_results']['result']: + a = i[x_label]['itemContent']['tweet_results']['result']['tweet']['legacy'] #适配限制回复账号 + frr = [a['favorite_count'], a['retweet_count'], a['reply_count']] + tweet_msecs = int(i[x_label]['itemContent']['tweet_results']['result']['tweet']['edit_control']['editable_until_msecs']) - 3600000 + else: + a = i[x_label]['itemContent']['tweet_results']['result']['legacy'] + frr = [a['favorite_count'], a['retweet_count'], a['reply_count']] + tweet_msecs = int(i[x_label]['itemContent']['tweet_results']['result']['edit_control']['editable_until_msecs']) - 3600000 + timestr = stamp2time(tweet_msecs) + + #我知道这边代码很烂 + #但我实在不想重构 ( º﹃º ) + + _result = time_comparison(tweet_msecs, start_time_stamp, end_time_stamp) + if _result[0]: #符合时间限制 + if 'extended_entities' in a and 'retweeted_status_result' not in a: + _photo_lst += [(get_heighest_video_quality(_media['video_info']['variants']), f'{timestr}-vid', [tweet_msecs, _user_info.name, f'@{_user_info.screen_name}', _media['expanded_url'], 'Video', get_heighest_video_quality(_media['video_info']['variants']), '', a['full_text']] + frr) if 'video_info' in _media and has_video else (_media['media_url_https'], f'{timestr}-img', [tweet_msecs, _user_info.name, f'@{_user_info.screen_name}', _media['expanded_url'], 'Image', _media['media_url_https'], '', a['full_text']] + frr) for _media in a['extended_entities']['media']] + + + elif 'retweeted_status_result' in a and 'extended_entities' in a['retweeted_status_result']['result']['legacy']: #判断是否为转推,以及是否获取转推 + _photo_lst += [(get_heighest_video_quality(_media['video_info']['variants']), f'{timestr}-vid-retweet', [tweet_msecs, a['retweeted_status_result']['result']['core']['user_results']['result']['legacy']['name'], f"@{a['retweeted_status_result']['result']['core']['user_results']['result']['legacy']['screen_name']}", _media['expanded_url'], 'Video', get_heighest_video_quality(_media['video_info']['variants']), '', a['retweeted_status_result']['result']['legacy']['full_text']] + frr) if 'video_info' in _media and has_video else (_media['media_url_https'], f'{timestr}-img-retweet', [tweet_msecs, a['retweeted_status_result']['result']['core']['user_results']['result']['legacy']['name'], f"@{a['retweeted_status_result']['result']['core']['user_results']['result']['legacy']['screen_name']}", _media['expanded_url'], 'Image', _media['media_url_https'], '', a['retweeted_status_result']['result']['legacy']['full_text']] + frr) for _media in a['retweeted_status_result']['result']['legacy']['extended_entities']['media']] + elif not _result[1]: #已超出目标时间范围 + start_label = False + break + + elif 'profile-conversation' in i['entryId']: #回复的推文(对话线索) + if 'tweet' in i[x_label]['items'][0]['item']['itemContent']['tweet_results']['result']: + a = i[x_label]['items'][0]['item']['itemContent']['tweet_results']['result']['tweet']['legacy'] + frr = [a['favorite_count'], a['retweet_count'], a['reply_count']] + tweet_msecs = int(i[x_label]['items'][0]['item']['itemContent']['tweet_results']['result']['tweet']['edit_control']['editable_until_msecs']) - 3600000 + else: + a = i[x_label]['items'][0]['item']['itemContent']['tweet_results']['result']['legacy'] + frr = [a['favorite_count'], a['retweet_count'], a['reply_count']] + tweet_msecs = int(i[x_label]['items'][0]['item']['itemContent']['tweet_results']['result']['edit_control']['editable_until_msecs']) - 3600000 + timestr = stamp2time(tweet_msecs) + + _result = time_comparison(tweet_msecs, start_time_stamp, end_time_stamp) + if _result[0]: #符合时间限制 + if 'extended_entities' in a: + _photo_lst += [(get_heighest_video_quality(_media['video_info']['variants']), f'{timestr}-vid', [tweet_msecs, _user_info.name, f'@{_user_info.screen_name}', _media['expanded_url'], 'Video', get_heighest_video_quality(_media['video_info']['variants']), '', a['full_text']] + frr) if 'video_info' in _media and has_video else (_media['media_url_https'], f'{timestr}-img', [tweet_msecs, _user_info.name, f'@{_user_info.screen_name}', _media['expanded_url'], 'Image', _media['media_url_https'], '', a['full_text']] + frr) for _media in a['extended_entities']['media']] + elif not _result[1]: #已超出目标时间范围 + start_label = False + break + + except Exception as e: + continue + if 'cursor-bottom' in i['entryId']: #更新下一页的请求编号(含转推模式&亮点模式) + _user_info.cursor = i['content']['value'] + + return _photo_lst + + print(f'已下载图片/视频:{_user_info.count}') + if has_highlights: ##2024-01-05 #适配[亮点]标签 + url_top = 'https://twitter.com/i/api/graphql/w9-i9VNm_92GYFaiyGT1NA/UserHighlightsTweets?variables={"userId":"' + _user_info.rest_id + '","count":20,' + url_bottom = '"includePromotedContent":true,"withVoice":true}&features={"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"c9s_tweet_anatomy_moderator_badge_enabled":true,"tweetypie_unmention_optimization_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":false,"tweet_awards_web_tipping_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_media_download_video_enabled":false,"responsive_web_enhance_cards_enabled":false}' + elif has_likes: + url_top = 'https://twitter.com/i/api/graphql/-fbTO1rKPa3nO6-XIRgEFQ/Likes?variables={"userId":"' + _user_info.rest_id + '","count":200,' + url_bottom = '"includePromotedContent":false,"withClientEventToken":false,"withBirdwatchNotes":false,"withVoice":true,"withV2Timeline":true}&features={"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"c9s_tweet_anatomy_moderator_badge_enabled":true,"tweetypie_unmention_optimization_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":false,"tweet_awards_web_tipping_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_media_download_video_enabled":false,"responsive_web_enhance_cards_enabled":false}' + elif has_retweet: #包含转推调用[UserTweets]的API(调用一次上限返回20条) + url_top = 'https://twitter.com/i/api/graphql/2GIWTr7XwadIixZDtyXd4A/UserTweets?variables={"userId":"' + _user_info.rest_id + '","count":20,' + url_bottom = '"includePromotedContent":false,"withQuickPromoteEligibilityTweetFields":true,"withVoice":true,"withV2Timeline":true}&features={"rweb_lists_timeline_redesign_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"tweetypie_unmention_optimization_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":false,"tweet_awards_web_tipping_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_media_download_video_enabled":false,"responsive_web_enhance_cards_enabled":false}&fieldToggles={"withAuxiliaryUserLabels":false,"withArticleRichContentState":false}' + else: #不包含转推则调用[UserMedia]的API(返回条数貌似无上限/改count) ##2023-12-11#此模式API返回值变动 + url_top = 'https://twitter.com/i/api/graphql/Le6KlbilFmSu-5VltFND-Q/UserMedia?variables={"userId":"' + _user_info.rest_id + '","count":500,' + url_bottom = '"includePromotedContent":false,"withClientEventToken":false,"withBirdwatchNotes":false,"withVoice":true,"withV2Timeline":true}&features={"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"tweetypie_unmention_optimization_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":false,"tweet_awards_web_tipping_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_media_download_video_enabled":false,"responsive_web_enhance_cards_enabled":false}' + + if _user_info.cursor: + url = url_top + '"cursor":"' + _user_info.cursor + '",' + url_bottom + else: + url = url_top + url_bottom #第一页,无cursor + try: + global request_count + response = httpx.get(url, headers=_headers, proxies=proxies).text + request_count += 1 + raw_data = json.loads(response) + if has_highlights: #亮点模式 + raw_data = raw_data['data']['user']['result']['timeline']['timeline']['instructions'][-1]['entries'] + elif has_retweet: #与likes共用 + raw_data = raw_data['data']['user']['result']['timeline_v2']['timeline']['instructions'][-1]['entries'] + else: #usermedia模式 + raw_data = raw_data['data']['user']['result']['timeline_v2']['timeline']['instructions'] + if (has_retweet or has_highlights) and 'cursor-top' in raw_data[0]['entryId']: #含转推模式 所有推文已全部下载完成 + return False + + if not has_retweet and not has_highlights: #usermedia模式下的下一页请求编号 + for i in raw_data[-1]['entries']: + if 'bottom' in i['entryId']: + _user_info.cursor = i['content']['value'] + # _user_info.cursor = raw_data[-1]['entries'][0]['content']['value'] + + if start_label: #判断是否超出时间范围 + if not has_retweet and not has_highlights: + global First_Page + if First_Page: #第一页的返回值需特殊处理 + raw_data = raw_data[-1]['entries'][0]['content']['items'] + First_Page = False + else: + if 'moduleItems' not in raw_data[0]: #usermedia新模式,所有推文已全部下载完成 + return False + else: + raw_data = raw_data[0]['moduleItems'] + photo_lst = get_url_from_content(raw_data) + else: + return False + + if not photo_lst: + photo_lst.append(True) + except Exception as e: + print('获取推文信息错误') + print(e) + print(response) + return False + return photo_lst + +def download_control(_user_info): + async def _main(): + async def down_save(url, prefix, csv_info, order: int): + if '.mp4' in url: + _file_name = f'{_user_info.save_path + os.sep}{prefix}_{_user_info.count + order}.mp4' + else: + try: + _file_name = f'{_user_info.save_path + os.sep}{prefix}_{_user_info.count + order}.{img_format}' + url += f'?format={img_format}&name=4096x4096' + except Exception as e: + print(url) + return False + count = 0 + while True: + try: + async with semaphore: + async with httpx.AsyncClient(proxies=proxies) as client: + global down_count + response = await client.get(url, timeout=(3.05, 16)) #如果出现第五次或以上的下载失败,且确认不是网络问题,可以适当降低最大并发数量 + down_count += 1 + with open(_file_name,'wb') as f: + f.write(response.content) + + if not has_likes: #非likes模式 + csv_info[-5] = os.path.split(_file_name)[1] + csv_file.data_input(csv_info) + + if log_output: + print(f'{_file_name}=====>下载完成') + + break + except Exception as e: + count += 1 + print(f'{_file_name}=====>第{count}次下载失败,正在重试(多次失败时请降低main.py第11行-异步模式)') + print(url) + + while True: + photo_lst = get_download_url(_user_info) + if not photo_lst: + break + elif photo_lst[0] == True: + continue + if async_down: + semaphore = asyncio.Semaphore(max_concurrent_requests) #最大并发数量,默认为8,对自己网络有自信的可以调高 + if down_log: + await asyncio.gather(*[asyncio.create_task(down_save(url[0], url[1], url[2], order)) for order,url in enumerate(photo_lst) if cache_data.is_present(url[0])]) + else: + await asyncio.gather(*[asyncio.create_task(down_save(url[0], url[1], url[2], order)) for order,url in enumerate(photo_lst)]) + else: + for order,url in enumerate(photo_lst): + await down_save(url[0], url[1], url[2], order) + _user_info.count += len(photo_lst) #更新计数 + + asyncio.run(_main()) + +def main(_user_info: object): + re_token = 'ct0=(.*?);' + _headers['x-csrf-token'] = re.findall(re_token,_headers['cookie'])[0] + _headers['referer'] = 'https://twitter.com/' + _user_info.screen_name + if not get_other_info(_user_info): + return False + print_info(_user_info) + _path = settings['save_path'] +_user_info.name + "@"+ _user_info.screen_name + if not os.path.exists(_path): #创建文件夹 + os.makedirs(settings['save_path'] +_user_info.name + "@"+ _user_info.screen_name) #用户名建文件夹 + _user_info.save_path = settings['save_path'] +_user_info.name + "@"+ _user_info.screen_name + else: + _user_info.save_path = _path + + if not has_likes: + global csv_file + csv_file = csv_gen(_user_info.save_path, _user_info.name, _user_info.screen_name, settings['time_range']) + + if down_log: + global cache_data + cache_data = cache_gen(_user_info.save_path) + + if autoSync: + files = sorted(os.listdir(_user_info.save_path)) + if len(files) > 0: + global start_time_stamp + re_rule = r'\d{4}-\d{2}-\d{2}' + for i in files[::-1]: + if "-img_" in i: + start_time_stamp = time2stamp(re.findall(re_rule, i)[0]) + break + elif "-vid_" in i: + start_time_stamp = time2stamp(re.findall(re_rule, i)[0]) + break + + download_control(_user_info) + + if not has_likes: + csv_file.csv_close() + if down_log: + del cache_data + print(f'{_user_info.name}下载完成\n\n') + +if __name__=='__main__': + _start = time.time() + for i in settings['user_lst'].split(','): + main(User_info(i)) + start_label = True + First_Page = True + print(f'共耗时:{time.time()-_start}秒\n共调用{request_count}次API\n共下载{down_count}份图片/视频') \ No newline at end of file -- 2.34.1