目录
1. 查找完全相同的一对照片
2. 查找相似照片
1. 查找完全相同的一对照片
利用MD5变换,找到两张一模一样的图片。
import json
import os

import cv2
import numpy as np
from hashlib import md5


def getmd5(image_path, md5_path):
    """Hash every file in a directory and save ``{md5: path}`` as JSON.

    Args:
        image_path: Directory whose direct children are hashed.
        md5_path: Destination JSON file (UTF-8, keys sorted, indent 2).

    Entries that are not regular files (e.g. sub-directories) are skipped
    instead of crashing ``open``. Files with identical content share one
    digest, so only one of their paths survives in the mapping.
    """
    image_md5 = {}
    for name in os.listdir(image_path):
        full_path = os.path.join(image_path, name)
        if not os.path.isfile(full_path):
            continue
        digest = md5()
        # Read in chunks so large images are never held in memory at once;
        # the context manager closes the handle even if reading fails.
        with open(full_path, 'rb') as img:
            for chunk in iter(lambda: img.read(1 << 20), b''):
                digest.update(chunk)
        image_md5[digest.hexdigest()] = full_path

    with open(md5_path, "w", encoding='utf-8') as fw:
        json.dump(image_md5, fw, indent=2, sort_keys=True, ensure_ascii=False)


def check(md5_val_path, md5_test_path, md5_repeat_path):
    """Report files whose MD5 appears in both the val and the test mapping.

    Args:
        md5_val_path: JSON file written by ``getmd5`` for the val set.
        md5_test_path: JSON file written by ``getmd5`` for the test set.
        md5_repeat_path: Destination JSON file receiving
            ``{md5: {'val': path, 'test': path}}`` for every shared digest.

    Each duplicate is printed as it is found, followed by a summary line.
    """
    with open(md5_val_path, "rt", encoding="utf-8") as fr:
        md5_val = json.load(fr)
    with open(md5_test_path, "rt", encoding="utf-8") as fr:
        md5_test = json.load(fr)

    md5_repeat = {}
    for digest, test_file in md5_test.items():
        if digest in md5_val:
            print('[md5]{} [val]{} [test]{}'.format(digest, md5_val[digest], test_file))
            md5_repeat[digest] = {'val': md5_val[digest], 'test': test_file}

    with open(md5_repeat_path, "w", encoding='utf-8') as fw:
        json.dump(md5_repeat, fw, indent=2, sort_keys=True, ensure_ascii=False)

    print('------------------ Analysis ------------------')
    # The counts must follow their labels: test count first, then val count.
    print('Test num {} Val num {} Repeat {}'.format(len(md5_test), len(md5_val), len(md5_repeat)))


if __name__ == '__main__':
    image_val_path = r'H:\testImages\原始图片池'
    # NOTE(review): this path has no separator after "testImages", unlike the
    # other paths here -- confirm whether a backslash is missing.
    image_test_path = r'H:\testImages待查原始图片池'
    md5_val_path = 'H:/testImages/md5_val.json'
    md5_test_path = 'H:/testImages/md5_test.json'
    md5_repeat_path = 'H:/testImages/md5_repeat.json'
    getmd5(image_val_path, md5_val_path)
    getmd5(image_test_path, md5_test_path)
    check(md5_val_path, md5_test_path, md5_repeat_path)
2. 查找相似照片
例如,对同一张照片经过压缩或手机翻拍后得到的两张内容相同的图片进行辨识(比如JPG压缩为PNG)。这种方法避免了MD5因雪崩效应而要求文件完全一致的限制,增强了鲁棒性。
主要综合四种指标:
1. 感知哈希(P Hashing 余弦),
2. 平均散列,
3. 梯度散列,
4. 离散小波变换
import os,imagehash
from PIL import Image


def _collect_hashes(folder, extension, hash_size, highfreq_factor, image_scale):
    """Return ``(names, hashes)`` for files in *folder* ending in *extension*.

    ``hashes[k]`` is the ``(phash, ahash, dhash, whash)`` tuple for
    ``names[k]``.
    """
    names = []
    hashes = []
    for file in os.listdir(folder):
        if os.path.splitext(file)[1] != extension:
            continue
        # Open each image once and close it deterministically; the hash
        # functions are all fed the same opened image.
        with Image.open(os.path.join(folder, file)) as img:
            hashes.append((
                # perceptual hashing
                imagehash.phash(img, hash_size=hash_size, highfreq_factor=highfreq_factor),
                # average hashing
                imagehash.average_hash(img, hash_size=hash_size),
                # difference (gradient) hashing
                imagehash.dhash(img, hash_size=hash_size),
                # wavelet hashing
                imagehash.whash(img, image_scale=image_scale, hash_size=hash_size, mode='db4'),
            ))
        names.append(file)
    return names, hashes


def hash(SourcePath, Test, threshold=0.97):
    """Print source images that look similar to any image in the test folder.

    Every ``.png`` in *SourcePath* is compared against every ``.jpg`` in
    *Test* using four image hashes (perceptual, average, difference,
    wavelet). For each pair the best of the four similarity scores is taken;
    when it exceeds *threshold*, the source file name and its size in KB are
    printed, followed by a separator line.

    Args:
        SourcePath: Directory scanned for ``.png`` reference images.
        Test: Directory scanned for ``.jpg`` images to look up.
        threshold: Similarity above which a pair is reported; defaults to
            the 0.97 used by the original script.

    Note: the name shadows the builtin ``hash``; it is kept so existing
    callers keep working.
    """
    highfreq_factor = 4  # resize scale used by phash
    hash_size = 32  # side length of each returned hash
    image_scale = 64

    list_file, source_hashes = _collect_hashes(
        SourcePath, '.png', hash_size, highfreq_factor, image_scale)
    list_fileTest, test_hashes = _collect_hashes(
        Test, '.jpg', hash_size, highfreq_factor, image_scale)

    for test_group in test_hashes:
        for source_name, source_group in zip(list_file, source_hashes):
            # Similarity per algorithm: 1 - (hash difference) / len(.hash)**2.
            value_hash = max(
                1 - (t - s) / len(t.hash) ** 2
                for t, s in zip(test_group, source_group)
            )
            if value_hash > threshold:
                size_j = os.path.getsize(os.path.join(SourcePath, source_name))
                print(source_name, str(size_j / 1024) + 'KB')
                print('***********************')


if __name__ == '__main__':
    imagesPath1 = r'H:\testImages\原始照片池'
    imagesPath2 = r'H:\testImages\对照照片池'
    # NOTE(review): imagesPath2 is never used; the same folder is passed as
    # both source and test -- confirm whether imagesPath2 was intended here.
    hash(imagesPath1, imagesPath1)