photo26/photologue/management/commands/duplicate.py
2022-03-02 21:23:40 +01:00

79 lines
2.8 KiB
Python

import hashlib
from django.core.management.base import BaseCommand, CommandError
from photologue.models import Gallery
class Command(BaseCommand):
    """Report duplicate photos per gallery and optionally delete the copies.

    Galleries are selected either with ``--all`` or with one or more
    ``--slugs``. Duplicate detection itself is delegated to
    ``find_duplicate``.
    """

    help = "List all duplicate for chosen galleries"

    def add_arguments(self, parser):
        """Register the ``--slugs``, ``--all`` and ``--delete`` options."""
        parser.add_argument(
            "--slugs",
            default=[],
            nargs="+",
            help="Try to find duplicate in the selected galleries",
        )
        parser.add_argument(
            "-a",
            "--all",
            help="Try to find duplicate in all galleries, overide any slugs given",
            action="store_true",
        )
        parser.add_argument("-d", "--delete", action="store_true")

    def handle(self, *args, **options):
        """Print the duplicates of each selected gallery; delete if asked.

        Raises:
            CommandError: if one of the given slugs matches no gallery.
        """
        # Selection is completed (and may raise) before any gallery is
        # processed, so a bad slug aborts the run before anything is deleted.
        targets = self._selected_galleries(options)
        for gallery in targets:
            groups = find_duplicate(gallery)
            self._report(gallery, groups)
            if options["delete"]:
                self._delete_copies(gallery, groups)

    def _selected_galleries(self, options):
        """Return the galleries designated by the command-line options."""
        # --all takes precedence over any slug that was also given.
        if options["all"]:
            return Gallery.objects.all()

        selected = []
        for slug in options["slugs"]:
            matches = Gallery.objects.filter(slug=slug)
            if not matches:
                raise CommandError(
                    f"Slug {slug} does not correspond to a "
                    "gallery in the database."
                )
            selected.extend(matches)
        return selected

    def _report(self, gallery, groups):
        """Write the gallery header followed by each duplicate group."""
        out = self.stdout
        out.write(f"Gallery {gallery.slug}:")
        for original, copies in groups:
            # No line ending here: the first copy continues on the same line.
            out.write(f" {original.slug} is duplicated:", ending="")
            for copy in copies:
                out.write(f" {copy.slug}")

    def _delete_copies(self, gallery, groups):
        """Delete every copy in *groups*, announcing each deletion first."""
        out = self.stdout
        out.write(" Deleting duplicate in {} :".format(gallery.slug))
        for _original, copies in groups:
            for copy in copies:
                out.write(" Deleting {}...".format(copy.slug))
                copy.delete()
def _image_sha256(photo, chunk_size=65536):
    """Return the SHA-256 digest of *photo*'s image file, read in chunks."""
    hasher = hashlib.sha256()
    with photo.image.open("rb") as f:
        # Hash incrementally so a large image is never held fully in memory.
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            hasher.update(chunk)
    return hasher.digest()


def find_duplicate(gallery):
    """Group the photos of *gallery* whose image files are byte-identical.

    Photos are compared by the SHA-256 digest of their image content. In
    each group of identical photos, the one with the shortest slug is
    treated as the original; when several share the shortest length, the
    one encountered first is kept. All other photos of the group are its
    copies.

    Args:
        gallery: object exposing ``photos.all()``; each photo must provide
            a ``slug`` string and an ``image`` with ``open("rb")``.

    Returns:
        A list of ``[original, copies]`` pairs, one per content that occurs
        more than once, ordered by first occurrence of that content.
        ``copies`` is a non-empty list of photos.
    """
    # {digest: [current original, [copies found so far]]}
    groups = {}
    for photo in gallery.photos.all():
        digest = _image_sha256(photo)
        group = groups.get(digest)
        if group is None:
            # First time this content is seen: provisional original.
            groups[digest] = [photo, []]
            continue

        original, copies = group
        # Apply the shortest-slug rule to every duplicate, including the
        # very first pair, so the outcome does not depend on how many
        # identical photos exist.
        if len(photo.slug) < len(original.slug):
            copies.append(original)
            group[0] = photo
        else:
            copies.append(photo)

    # Only the groups matter to callers; the digests are dropped, as are
    # contents that occur a single time.
    return [group for group in groups.values() if group[1]]