import { mkdir, readdir, rename } from 'fs/promises';
import { getArg } from './lib/args.js';
import { getChildDirectories } from './lib/io.js';
import { error, log } from './lib/log.js';

const ctx = 'dedupeIds.js';

// Matches `id-YYYYMMDD_HHMMSS-typeCount` filenames.
// Capture groups: 1 = id, 2 = date, 3-5 = year/month/day, 6 = time,
// 7-9 = hour/minute/second, 10 = typeCount (e.g. `img1`), 11 = type, 12 = count.
const idDateTimeRegex = new RegExp('([0-9]*)-(([0-9]{4})([0-9]{2})([0-9]{2}))_(([0-9]{2})([0-9]{2})([0-9]{2}))-((img|vid|gif)([0-9]*))');

/**
 * Reads through the db directory and looks for duplicate entries by id.
 *
 * If two files share an id, the provided offset determines which copy is marked as the duplicate:
 * - If `--offset="-"`, the file with the older date is marked as the duplicate.
 * - If `--offset="+"`, the file with the newer date is marked as the duplicate.
 *
 * This exists because `twittermediadownloader` used local time when saving files, so timestamps
 * can drift from UTC by an offset that changes with DST.
 * This repo uses UTC when saving times for the sake of consistency.
 */
const dedupeIds = async () => {
  let directory, offset;
  try {
    directory = getArg('path');
    offset = getArg('offset');
  } catch (err) {
    error(ctx, err);
    return;
  }

  log(ctx, 'Reading directories');
  const userDirs = await getChildDirectories(directory);

  // Process users sequentially; `forEach(async ...)` would fire everything off without awaiting.
  for (const user of userDirs) {
    const workingDir = `${directory}/${user}`;
    try {
      await mkdir(`${workingDir}/dups`);
    } catch (err) {
      // An existing dups directory is fine; only report other failures.
      if (!err.toString().includes('EEXIST')) {
        error(ctx, err);
      }
    }

    log(ctx, `Finding dups for ${user}`);
    // Sort so files sharing an id sit next to each other; readdir does not guarantee order.
    const files = (await readdir(workingDir)).sort();
    const dupsToMove = [];

    for (let i = 0; i < files.length; ++i) {
      // Skip when the current file is already in dupsToMove
      if (dupsToMove.includes(files[i])) {
        continue;
      }

      // Go back/forth ~5 files, as the max per post is 4; this should cover enough files without too much looping
      const neighborMin = Math.max(i - 5, 0);
      const neighborMax = Math.min(i + 5, files.length - 1);

      for (let neighbor = neighborMin; neighbor < neighborMax; ++neighbor) {
        if (dupsToMove.includes(files[neighbor]) || neighbor === i) {
          continue;
        }

        // Test the neighbor and self for regex pattern id-date_time-typeCount.extension
        const neighborTest = idDateTimeRegex.exec(files[neighbor]);
        const selfTest = idDateTimeRegex.exec(files[i]);

        if (neighborTest && selfTest) {
          // If the id and the typeCount (e.g. img1 === img1) are the same, then the file is a dup
          if (neighborTest[1] === selfTest[1] && neighborTest[10] === selfTest[10]) {
            // Create dates from the date_time component of the files (JS Date months are 0-based)
            const neighborDate = new Date(neighborTest[3], neighborTest[4] - 1, neighborTest[5], neighborTest[7], neighborTest[8], neighborTest[9]);
            const selfDate = new Date(selfTest[3], selfTest[4] - 1, selfTest[5], selfTest[7], selfTest[8], selfTest[9]);

            // Mark per the offset semantics above: "-" marks the older copy, "+" marks the newer copy.
            const selfIsNewer = selfDate.getTime() > neighborDate.getTime();
            const markSelf = selfIsNewer === offset.includes('+');

            if (markSelf) {
              dupsToMove.push(files[i]);
              log(ctx, `${files[i]} duplicate of ${files[neighbor]}`);
            } else {
              dupsToMove.push(files[neighbor]);
              log(ctx, `${files[neighbor]} duplicate of ${files[i]}`);
            }
          }
        }
      }
    }

    log(ctx, `Moving dups for ${user}`);
    for (const dup of dupsToMove) {
      try {
        await rename(`${workingDir}/${dup}`, `${workingDir}/dups/${dup}`);
      } catch (err) {
        error(ctx, err);
      }
    }
    log(ctx, `Moving finished for ${user}`);
  }
};

dedupeIds();
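
// Example invocation (a sketch: the `--offset` syntax comes from the JSDoc above, while the
// `--path` flag name is assumed from `getArg('path')` -- confirm the exact flag format against lib/args.js):
//   node dedupeIds.js --path="./db" --offset="-"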