use UTC for timestamps and provide deduplicator
This commit is contained in:
parent
ef16b61219
commit
060d49261b
2 changed files with 98 additions and 4 deletions
|
@ -6,9 +6,9 @@
|
|||
"{author['name']}"
|
||||
],
|
||||
"filename": {
|
||||
"locals().get('bitrate', 0) > 0": "{author['name']}-{tweet_id}-{date:Olocal/%Y%m%d_%H%M%S}-vid{num}.{extension}",
|
||||
"locals().get('bitrate') == 0": "{author['name']}-{tweet_id}-{date:Olocal/%Y%m%d_%H%M%S}-gif{num}.{extension}",
|
||||
"": "{author['name']}-{tweet_id}-{date:Olocal/%Y%m%d_%H%M%S}-img{num}.{extension}"
|
||||
"locals().get('bitrate', 0) > 0": "{author['name']}-{tweet_id}-{date:%Y%m%d_%H%M%S}-vid{num}.{extension}",
|
||||
"locals().get('bitrate') == 0": "{author['name']}-{tweet_id}-{date:%Y%m%d_%H%M%S}-gif{num}.{extension}",
|
||||
"": "{author['name']}-{tweet_id}-{date:%Y%m%d_%H%M%S}-img{num}.{extension}"
|
||||
},
|
||||
"cookies": "$HOME/.config/gallery-dl/twitter-cookies.txt",
|
||||
"timeline": {
|
||||
|
@ -18,7 +18,7 @@
|
|||
{
|
||||
"name": "metadata",
|
||||
"event": "post",
|
||||
"filename": "{author[name]}_{tweet_id}-{date:Olocal/%Y%m%d_%H%M%S}.json"
|
||||
"filename": "{author['name']}-{tweet_id}-{date:%Y%m%d_%H%M%S}.json"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
94
run-dedupeIds.js
Normal file
94
run-dedupeIds.js
Normal file
|
@ -0,0 +1,94 @@
|
|||
import { mkdir, readdir, rename } from 'fs/promises';
|
||||
|
||||
import { getArg } from './lib/args.js';
|
||||
import { getChildDirectories } from './lib/io.js';
|
||||
import { error, log } from './lib/log.js';
|
||||
|
||||
// Context tag prefixed to every log/error message from this script.
const ctx = 'dedupeIds.js';
// Matches downloader filenames of the form "<id>-<YYYYMMDD>_<HHMMSS>-<img|vid|gif><num>".
// Capture groups: [1]=tweet id, [2]=date, [3]=year, [4]=month, [5]=day,
// [6]=time, [7]=hour, [8]=minute, [9]=second, [10]=typeCount (e.g. "img1"),
// [11]=media type, [12]=count.
const idDateTimeRegex = new RegExp('([0-9]*)-(([0-9]{4})([0-9]{2})([0-9]{2}))_(([0-9]{2})([0-9]{2})([0-9]{2}))-((img|vid|gif)([0-9]*))');
|
||||
|
||||
/**
 * Reads through the db directory and looks for duplicate entries by id.
 *
 * If they have the same id, it will use the provided offset to determine which time is correct.
 * - If `--offset="-"`, it will mark the file with the older date as a duplicate.
 * - If `--offset="+"`, it will mark the file with the newer date as a duplicate.
 *
 * This exists because `twittermediadownloader` used local time when saving files, which would possibly not be fixed because of DST.
 * This repo uses UTC when saving times for the sake of consistency.
 */
const dedupeIds = async () => {
  let directory;
  let offset;
  try {
    directory = getArg('path');
    offset = getArg('offset');
  } catch (err) {
    error(ctx, err);
    return;
  }

  log(ctx, 'Reading directories');
  const userDirs = await getChildDirectories(directory);

  // Process users sequentially. `forEach(async ...)` would fire-and-forget
  // every iteration: rejections escape unhandled and "finished" logs lie.
  for (const user of userDirs) {
    const workingDir = `${directory}/${user}`;
    try {
      await mkdir(`${workingDir}/dups`);
    } catch (err) {
      // An already-existing dups/ directory is expected; report anything else.
      if (!err.toString().includes('EEXIST')) {
        error(ctx, err);
      }
    }

    log(ctx, `Finding dups for ${user}`);
    const files = await readdir(workingDir);
    const dupsToMove = [];

    for (let i = 0; i < files.length; ++i) {
      // Skip when the current file is already in dupsToMove
      if (dupsToMove.includes(files[i])) {
        continue;
      }

      // Only compare nearby files: a post holds at most 4 media items, so
      // checking +/-5 neighbors covers every possible duplicate without a
      // full O(n^2) scan. Inclusive bounds (the original `<` skipped i+5).
      const neighborMin = Math.max(i - 5, 0);
      const neighborMax = Math.min(i + 5, files.length - 1);
      for (let neighbor = neighborMin; neighbor <= neighborMax; ++neighbor) {
        if (neighbor === i || dupsToMove.includes(files[neighbor])) {
          continue;
        }

        // Test the neighbor and self for regex pattern id-date_time-typeCount.extension
        const neighborTest = idDateTimeRegex.exec(files[neighbor]);
        const selfTest = idDateTimeRegex.exec(files[i]);
        if (!neighborTest || !selfTest) {
          continue;
        }

        // Same id and same typeCount (e.g. img1 === img1) => duplicate pair.
        if (neighborTest[1] !== selfTest[1] || neighborTest[10] !== selfTest[10]) {
          continue;
        }

        // Create dates from the date_time component of the files.
        // NOTE: the Date constructor's month argument is 0-indexed, so both
        // dates land one month late in absolute terms — but the shift is
        // identical on both sides, and only their relative order is used.
        const neighborDate = new Date(neighborTest[3], neighborTest[4], neighborTest[5], neighborTest[7], neighborTest[8], neighborTest[9]);
        const selfDate = new Date(selfTest[3], selfTest[4], selfTest[5], selfTest[7], selfTest[8], selfTest[9]);

        // --offset="-": the OLDER file is the duplicate.
        // --offset="+": the NEWER file is the duplicate.
        // (Fixes the original, which always marked `self` under "+" even when
        //  the neighbor was the newer file.)
        const selfIsNewer = selfDate.getTime() > neighborDate.getTime();
        const markNeighbor = offset.includes('-') ? selfIsNewer : !selfIsNewer;
        if (markNeighbor) {
          dupsToMove.push(files[neighbor]);
          log(ctx, `${files[neighbor]} duplicate of ${files[i]}`);
        } else {
          dupsToMove.push(files[i]);
          log(ctx, `${files[i]} duplicate of ${files[neighbor]}`);
          // Self is now a known dup; no point comparing it to more neighbors.
          break;
        }
      }
    }

    log(ctx, `Moving dups for ${user}`);
    // Move sequentially so every rename settles (and errors are reported)
    // before we announce completion for this user.
    for (const dup of dupsToMove) {
      try {
        await rename(`${workingDir}/${dup}`, `${workingDir}/dups/${dup}`);
      } catch (err) {
        error(ctx, err);
      }
    }

    log(ctx, `Moving finished for ${user}`);
  }
};

dedupeIds();
|
Loading…
Add table
Reference in a new issue