From eec7e72ce027b700e070d99d4a607250f571b9b0 Mon Sep 17 00:00:00 2001
From: Lightling
Date: Fri, 9 Feb 2024 22:28:41 -0500
Subject: [PATCH] handle /search API after running /media requests

---
 lib/dl.js         | 63 +++++++++++++++++++++++++++++++++++++++++------
 run-downloadDb.js |  9 ++++---
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/lib/dl.js b/lib/dl.js
index 36a68bb..feb2cd3 100644
--- a/lib/dl.js
+++ b/lib/dl.js
@@ -4,21 +4,49 @@ import { getArg } from './args.js';
 import { error, log } from './log.js';
 
 const ctx = 'getUser.js';
+// matches `name-id-yyyymmdd_hhmmss` (alphanumerics, digits, then the stamp) and captures each date/time component
+const loggedDateRegex = /[a-zA-Z0-9]+-[0-9]+-([0-9]{4})([0-9]{2})([0-9]{2})_([0-9]{2})([0-9]{2})([0-9]{2})/g;
+
+/**
+ * Gets a date formatted as yyyy-mm-dd for use with `/search`
+ * from the last logged date (which is assumed to be the oldest retrieved image from a previous run)
+ * @param {string[]} logs
+ * @returns {string | undefined} the day after the last logged date, or `undefined` if no date was logged
+ */
+const getDateUrlFromLog = (logs) => {
+  // sometimes the logs get grouped into one single string, while others get individually logged
+  // may as well flatten it to a single string and test with regex to make consistent and avoid bugs
+  const flat = logs.flat().join('\n');
+  // matchAll iterates over a copy of the regex, so no `lastIndex` state leaks between calls
+  const matches = [...flat.matchAll(loggedDateRegex)];
+  if (matches.length === 0) {
+    // nothing usable was logged, so there is no upper bound to offer
+    return undefined;
+  }
+  const [year, month, day] = matches[matches.length - 1].slice(1, 4).map(Number);
+  // logged months are 1-based but Date months are 0-based; build in UTC so the formatting below cannot shift the day
+  const date = new Date(Date.UTC(year, month - 1, day));
+  // to be safe, spring forward a day
+  date.setUTCDate(date.getUTCDate() + 1);
+
+  return date.toISOString().slice(0, 10);
+};
 
 /**
  * Runs {@link getUser} concurrently for many users
  * @param {{ user: string, logs: string[] }[]} userDb userDb to run {@link getUser} on
  * @param {number} threadMax max number of threads to run concurrently
  * @param {string} directory the directory to save the user media folders in
+ * @param {'media' | 'search'} [mode='media'] whether to retrieve from the `/media` route or `/search` API
  * @returns {Promise} promise which resolves once all threads for every user have finished
  */
-export const getMany = (userDb, threadMax, directory) => new Promise((resolve, reject) => {
+export const getMany = (userDb, threadMax, directory, mode = 'media') => new Promise((resolve, reject) => {
   let running = 0;
   let index = 0;
 
   const get = () => {
     const onFinish = (currentIndex) => {
-      log(ctx, `Finished ${userDb[currentIndex].user}/media`);
+      log(ctx, `Finished ${userDb[currentIndex].user} under ${mode} mode`);
       --running;
       get();
     }
@@ -26,8 +54,12 @@ export const getMany = (userDb, threadMax, directory) => new Promise((resolve, r
     while (running < threadMax && index < userDb.length) {
       ++running;
       let currentIndex = index++;
+      const modeParams = mode === 'media' ? 'media' : {
+        from: '2007-12-31',
+        to: getDateUrlFromLog(userDb[currentIndex].logs),
+      };
 
-      let proc = getUser(userDb[currentIndex].user, directory);
+      let proc = getUser(userDb[currentIndex].user, directory, modeParams);
       proc.stdout.on('data', data => {
         userDb[currentIndex].logs.push(data);
       });
@@ -47,10 +79,24 @@ export const getMany = (userDb, threadMax, directory) => new Promise((resolve, r
  * Retrieves gallery for specified user and saves to the specified parent path
  * @param {string} user the user to retrieve media from
  * @param {string} path the path to save the user's media folder in
+ * @param {'media' | { from?: string, to?: string }} [modeParams='media'] depending on the mode:
+ * - if `'media'`, will retrieve from /media
+ * - if object, will retrieve from /search?f=live&src=typed_query&q=(from%3A{user})%20since%3A{from}%20until%3A{to}
+ *   (`since` / `until` are left out when `from` / `to` is empty)
  * @returns {ChildProcess} the process that was {@link spawn spawned}
  */
-export const getUser = (user, path) => {
-  const url = `https://twitter.com/${user}/media`;
+export const getUser = (user, path, modeParams = 'media') => {
+  // e.g. "https://twitter.com/search?f=live&src=typed_query&q=(from%3Aad_varg)%20since%3A2007-01-01%20until%3A2023-06-30"
+  let url = `https://twitter.com/${user}/media`;
+  if (modeParams !== 'media') {
+    url = `https://twitter.com/search?f=live&src=typed_query&q=(from%3A${user})`;
+    if (modeParams.from) {
+      url += `%20since%3A${modeParams.from}`;
+    }
+    if (modeParams.to) {
+      url += `%20until%3A${modeParams.to}`;
+    }
+  }
   let args;
   try {
     args = getArg('args');
@@ -58,8 +104,11 @@ export const getUser = (user, path) => {
     log(ctx, 'No args being provided to gallery-dl');
   }
 
-  log(ctx, `python3 ~/.local/bin/gallery-dl -c ./config.json${!!args ? ' ' + args + ' ' : ' '}-d ${path} ${url}`);
-  const proc = spawn(`python3 ~/.local/bin/gallery-dl -c ./config.json${!!args ? ' ' + args + ' ' : ' '}-d ${path} ${url}`, { shell: true });
+  // NOTE(review): `user`, `path` and `args` are interpolated into a shell command unescaped - confirm they are trusted
+  // the url is quoted because the search query contains parentheses, which the shell would otherwise interpret
+  const command = `python3 ~/.local/bin/gallery-dl -c ./config.json${args ? ` ${args} ` : ' '}-d ${path} "${url}"`;
+  log(ctx, command);
+  const proc = spawn(command, { shell: true });
 
   proc.stdout.on('data', data => {
     log(ctx, data);
diff --git a/run-downloadDb.js b/run-downloadDb.js
index ccacb11..4583a76 100644
--- a/run-downloadDb.js
+++ b/run-downloadDb.js
@@ -39,10 +39,13 @@ const downloadDb = async () => {
     logs: [],
   }));
 
-  log(ctx, `Building db using /media for ${processes.length} users`);
-  await getMany(processes, threadMax, directory);
+  log(ctx, `Downloading media using /media for ${processes.length} users`);
+  await getMany(processes, threadMax, directory, 'media');
 
-  log(ctx, 'Building db using /search');
+  log(ctx, 'Downloading media using /search');
+  await getMany(processes, threadMax, directory, 'search');
+
+  log(ctx, 'Done');
 }
 
 downloadDb();