diff --git a/config.json b/config.json
index 07259fa..8de61c0 100644
--- a/config.json
+++ b/config.json
@@ -6,9 +6,9 @@
         "{author['name']}"
     ],
     "filename": {
-        "locals().get('bitrate', 0) > 0": "{author['name']}-{tweet_id}-{date:Olocal/%Y%m%d_%H%M%S}-vid{num}.{extension}",
-        "locals().get('bitrate') == 0": "{author['name']}-{tweet_id}-{date:Olocal/%Y%m%d_%H%M%S}-gif{num}.{extension}",
-        "": "{author['name']}-{tweet_id}-{date:Olocal/%Y%m%d_%H%M%S}-img{num}.{extension}"
+        "locals().get('bitrate', 0) > 0": "{author['name']}-{tweet_id}-{date:%Y%m%d_%H%M%S}-vid{num}.{extension}",
+        "locals().get('bitrate') == 0": "{author['name']}-{tweet_id}-{date:%Y%m%d_%H%M%S}-gif{num}.{extension}",
+        "": "{author['name']}-{tweet_id}-{date:%Y%m%d_%H%M%S}-img{num}.{extension}"
    },
    "cookies": "$HOME/.config/gallery-dl/twitter-cookies.txt",
    "timeline": {
@@ -18,7 +18,7 @@
        {
            "name": "metadata",
            "event": "post",
-           "filename": "{author[name]}_{tweet_id}-{date:Olocal/%Y%m%d_%H%M%S}.json"
+           "filename": "{author['name']}-{tweet_id}-{date:%Y%m%d_%H%M%S}.json"
        }
    ]
}
diff --git a/run-dedupeIds.js b/run-dedupeIds.js
new file mode 100644
index 0000000..509dbf5
--- /dev/null
+++ b/run-dedupeIds.js
@@ -0,0 +1,102 @@
+import { mkdir, readdir, rename } from 'fs/promises';
+
+import { getArg } from './lib/args.js';
+import { getChildDirectories } from './lib/io.js';
+import { error, log } from './lib/log.js';
+
+const ctx = 'dedupeIds.js';
+// Matches "<id>-<YYYYMMDD>_<HHMMSS>-<img|vid|gif><num>"; capture groups:
+// 1 = id, 3/4/5 = year/month/day, 7/8/9 = hour/minute/second, 10 = typeCount (e.g. "img1").
+const idDateTimeRegex = /([0-9]*)-(([0-9]{4})([0-9]{2})([0-9]{2}))_(([0-9]{2})([0-9]{2})([0-9]{2}))-((img|vid|gif)([0-9]*))/;
+
+/**
+ * Reads through the db directory and looks for duplicate entries by id.
+ *
+ * If they have the same id, it will use the provided offset to determine which time is correct.
+ * - If `--offset="-"`, it will mark the file with the older date as a duplicate.
+ * - If `--offset="+"`, it will mark the file with the newer date as a duplicate.
+ *
+ * This exists because `twittermediadownloader` used local time when saving files, which would possibly not be fixed because of DST.
+ * This repo uses UTC when saving times for the sake of consistency.
+ */
+const dedupeIds = async () => {
+  let directory, offset;
+  try {
+    directory = getArg('path');
+    offset = getArg('offset');
+  } catch (err) {
+    error(ctx, err);
+    return;
+  }
+
+  log(ctx, 'Reading directories');
+  const userDirs = await getChildDirectories(directory);
+
+  // Process users sequentially; `forEach(async ...)` would fire every user at
+  // once and silently drop rejections.
+  for (const user of userDirs) {
+    const workingDir = `${directory}/${user}`;
+    try {
+      await mkdir(`${workingDir}/dups`);
+    } catch (err) {
+      // The dups directory already existing is fine; report anything else.
+      if (err.code !== 'EEXIST') {
+        error(ctx, err);
+      }
+    }
+
+    log(ctx, `Finding dups for ${user}`);
+    const files = await readdir(workingDir);
+    const dupsToMove = [];
+
+    for (let i = 0; i < files.length; ++i) {
+      // Skip when the current file is already in dupsToMove
+      if (dupsToMove.includes(files[i])) {
+        continue;
+      }
+
+      // Go back/forth ~5 images, as the max per post is 4; this should cover enough files without too much looping
+      const neighborMin = Math.min(Math.max(i - 5, 0), files.length - 1);
+      const neighborMax = Math.min(Math.max(i + 5, 0), files.length - 1);
+      for (let neighbor = neighborMin; neighbor <= neighborMax; ++neighbor) {
+        if (dupsToMove.includes(files[neighbor]) || neighbor === i) {
+          continue;
+        }
+
+        // Test the neighbor and self for regex pattern id-date_time-typeCount.extension
+        const neighborTest = idDateTimeRegex.exec(files[neighbor]);
+        const selfTest = idDateTimeRegex.exec(files[i]);
+        if (!!neighborTest && !!selfTest) {
+          // If the id and the typeCount (e.g. img1 === img1) are the same, then the file is a dup
+          if (neighborTest[1] === selfTest[1] && neighborTest[10] === selfTest[10]) {
+            // Create dates from the date_time component of the files.
+            // Date months are 0-indexed, so subtract 1 from the parsed month.
+            const neighborDate = new Date(neighborTest[3], neighborTest[4] - 1, neighborTest[5], neighborTest[7], neighborTest[8], neighborTest[9]);
+            const selfDate = new Date(selfTest[3], selfTest[4] - 1, selfTest[5], selfTest[7], selfTest[8], selfTest[9]);
+
+            if (selfDate.getTime() > neighborDate.getTime() && offset.includes('-')) {
+              dupsToMove.push(files[neighbor]);
+              log(ctx, `${files[neighbor]} duplicate of ${files[i]}`);
+            } else {
+              dupsToMove.push(files[i]);
+              log(ctx, `${files[i]} duplicate of ${files[neighbor]}`);
+            }
+          }
+        }
+      }
+    }
+
+    log(ctx, `Moving dups for ${user}`);
+    for (const dup of dupsToMove) {
+      try {
+        await rename(`${workingDir}/${dup}`, `${workingDir}/dups/${dup}`);
+      } catch (err) {
+        error(ctx, err);
+      }
+    }
+
+    log(ctx, `Moving finished for ${user}`);
+  }
+};
+
+dedupeIds();