visit
Today you're going to learn how to use Python programming in a way that can ultimately save a lot of space on your drive by removing all the duplicates.
In many situations you may find yourself having duplicates files on your disk and but when it comes to tracking and checking them manually it can tedious.
Heres a solution
Instead of tracking throughout your disk to see if there is a duplicate, you can automate the process using coding, by writing a program to recursively track through the disk and remove all the found duplicates and that's what this article is about.But How do we do it?
If we were to read the whole file and then compare it to the rest of the files recursively through the given directory it will take a very long time, then how do we do it?The answer is hashing, with hashing can generate a given string of letters and numbers which act as the identity of a given file and if we find any other file with the same identity we gonna delete it.
There's a variety of hashing algorithms out there such as
Hashing in Python is pretty straight forward we are going to use hashlib library which comes by default with Python standard library
Below is an example of how we hash stuff using hashlib, we are going to hash of a string in Python using md5 hashing algorithmsExample of Usage>>> import hashlib
>>> example_text = "Duplython is amazing".encode('utf-8')
>>> hashlib.md5(example_text).hexdigest()
'73a14f46eadcc04f4e04bec8eb66f2ab'
Hashing File
Let's say you have simple text document on your project directory with name learn.txt, This is how we will do it.>>> import hashlib
>>> file = open('learn.txt', 'rb').read()
>>> hashlib.md5(file).hexdigest()
'0534cf6d5816c4f1ace48fff75f616c9'
Example of Usage
>>> import hashlib
>>> block_size = 1024
>>> hash = hashlib.md5()
>>> with open('learn.txt', 'rb') as file:
... block = file.read(block_size)
... while len(block)>0:
... hash.update(block)
... block = file.read(block_size)
... print(hash)
...
0534cf6d5816c4f1ace48fff75f616c9
>>> import os
>>> os.listdir()
['Desktop-File-cleaner', '.git', 'learn.txt', 'app.py', 'README.md']
>>> os.remove('learn.txt')
>>> os.listdir()
['Desktop-File-cleaner', '.git', 'app.py', 'README.md']
Well, that’s simple you just call remove ( ) with a parameter of the name of the file you wanna remove done. now let’s go build our application.
import time
import os
from hashlib import sha256
I'm a huge fan of Object-oriented programming and on this article, we gonna build our tool as a single class, below is just as exoskeleton class for our code.
import time
import os
from hashlib import sha256
class Duplython:
def __init__(self):
self.home_dir = os.getcwd(); self.File_hashes = []
self.Cleaned_dirs = []; self.Total_bytes_saved = 0
self.block_size = 65536; self.count_cleaned = 0
def welcome(self)->None:
print('******************************************************************')
print('**************** DUPLYTHON ****************************')
print('********************************************************************\n\n')
print('---------------- WELCOME ----------------------------')
time.sleep(3)
print('\nCleaning .................')
def main(self)->None:
self.welcome()
if __name__ == '__main__':
App = Duplython()
App.main()
$ python3 app.py
******************************************************************
**************** DUPLYTHON ****************************
********************************************************************
---------------- WELCOME ----------------------------
Cleaning .................
We now have to create a simple function to generate hash of a file with a given path using the hashing knowledge we have learned above.
import time
import os
from hashlib import sha256
class Duplython:
def __init__(self):
self.home_dir = os.getcwd(); self.File_hashes = []
self.Cleaned_dirs = []; self.Total_bytes_saved = 0
self.block_size = 65536; self.count_cleaned = 0
def welcome(self)->None:
print('******************************************************************')
print('**************** DUPLYTHON ****************************')
print('********************************************************************\n\n')
print('---------------- WELCOME ----------------------------')
time.sleep(3)
print('\nCleaning .................')
def generate_hash(self, Filename:str)->str:
Filehash = sha256()
try:
with open(Filename, 'rb') as File:
fileblock = File.read(self.block_size)
while len(fileblock)>0:
Filehash.update(fileblock)
fileblock = File.read(self.block_size)
Filehash = Filehash.hexdigest()
return Filehash
except:
return False
def main(self)->None:
self.welcome()
if __name__ == '__main__':
App = Duplython()
App.main()
import time
import os
from hashlib import sha256
class Duplython:
def __init__(self):
self.home_dir = os.getcwd(); self.File_hashes = []
self.Cleaned_dirs = []; self.Total_bytes_saved = 0
self.block_size = 65536; self.count_cleaned = 0
def welcome(self)->None:
print('******************************************************************')
print('**************** DUPLYTHON ****************************')
print('********************************************************************\n\n')
print('---------------- WELCOME ----------------------------')
time.sleep(3)
print('\nCleaning .................')
def generate_hash(self, Filename:str)->str:
Filehash = sha256()
try:
with open(Filename, 'rb') as File:
fileblock = File.read(self.block_size)
while len(fileblock)>0:
Filehash.update(fileblock)
fileblock = File.read(self.block_size)
Filehash = Filehash.hexdigest()
return Filehash
except:
return False
def clean(self)->None:
all_dirs = [path[0] for path in os.walk('.')]
for path in all_dirs:
os.chdir(path)
All_Files =[file for file in os.listdir() if os.path.isfile(file)]
for file in All_Files:
filehash = self.generate_hash(file)
if not filehash in self.File_hashes:
if filehash:
self.File_hashes.append(filehash)
#print(file)
else:
byte_saved = os.path.getsize(file); self.count_cleaned+=1
self.Total_bytes_saved+=byte_saved
os.remove(file); filename = file.split('/')[-1]
print(filename, '.. cleaned ')
os.chdir(self.home_dir)
def main(self)->None:
self.welcome();self.clean()
if __name__ == '__main__':
App = Duplython()
App.main()
I have implemented the method cleaning_summary () just to do that. Print out the summary of the cleaning process to the screen which completes our python tool as shown below:
import time
import os
import shutil
from hashlib import sha256
class Duplython:
def __init__(self):
self.home_dir = os.getcwd(); self.File_hashes = []
self.Cleaned_dirs = []; self.Total_bytes_saved = 0
self.block_size = 65536; self.count_cleaned = 0
def welcome(self)->None:
print('******************************************************************')
print('**************** DUPLYTHON ****************************')
print('********************************************************************\n\n')
print('---------------- WELCOME ----------------------------')
time.sleep(3)
print('\nCleaning .................')
def generate_hash(self, Filename:str)->str:
Filehash = sha256()
try:
with open(Filename, 'rb') as File:
fileblock = File.read(self.block_size)
while len(fileblock)>0:
Filehash.update(fileblock)
fileblock = File.read(self.block_size)
Filehash = Filehash.hexdigest()
return Filehash
except:
return False
def clean(self)->None:
all_dirs = [path[0] for path in os.walk('.')]
for path in all_dirs:
os.chdir(path)
All_Files =[file for file in os.listdir() if os.path.isfile(file)]
for file in All_Files:
filehash = self.generate_hash(file)
if not filehash in self.File_hashes:
if filehash:
self.File_hashes.append(filehash)
#print(file)
else:
byte_saved = os.path.getsize(file); self.count_cleaned+=1
self.Total_bytes_saved+=byte_saved
os.remove(file); filename = file.split('/')[-1]
print(filename, '.. cleaned ')
os.chdir(self.home_dir)
def cleaning_summary(self)->None:
mb_saved = self.Total_bytes_saved/1048576
mb_saved = round(mb_saved, 2)
print('\n\n--------------FINISHED CLEANING ------------')
print('File cleaned : ', self.count_cleaned)
print('Total Space saved : ', mb_saved, 'MB')
print('-----------------------------------------------')
def main(self)->None:
self.welcome();self.clean();self.cleaning_summary()
if __name__ == '__main__':
App = Duplython()
App.main()
$ python3 app.py
******************************************************************
**************** DUPLYTHON ****************************
********************************************************************
---------------- WELCOME ----------------------------
Cleaning .................
0(copy).jpeg .. cleaned
0 (1)(copy).jpeg .. cleaned
0 (2)(copy).jpeg .. cleaned
--------------FINISHED CLEANING ------------
File cleaned : 3
Total Space saved : 0.38 MB
-----------------------------------------------
This article is also published on .