这是一个棘手的问题。我的磁盘中存储了大约1000张图像,我想通过成对比较来找到彼此相似的图像。所以我需要做大约1000 * 999/2 = 499,500次比较("相似"的属性;不是及物的)。我的问题与如何比较图像无关,而是与如何在比较期间有效地管理我的机器的内存有关。我已经实现了比较函数:
static bool AreSimilar(ImageInfo x, ImageInfo y)
{
// Logic
}
…其中ImageInfo
是保存一张图像信息的类:
class ImageInfo : IDisposable
{
public string Path { get; init; }
public System.Drawing.Image Image { get; init; }
public void Dispose() => Image.Dispose();
}
理想情况下,我希望在内存中加载所有1,000张图像,然后执行嵌套循环并为每对图像调用AreSimilar
方法,但是一次加载所有图像所需的内存远远超过了我的机器的可用内存。图像文件非常大,它们的大小变化很大(大多数的大小在5到50 MB之间)。可用的RAM是2 GB,所以我不能同时加载超过80个图像。从磁盘加载映像很慢。实际上,从磁盘加载两个映像要比比较它们慢得多并找出它们是否相似。
我的问题是如何实现一种方法,该方法将负责从磁盘加载/卸载图像,并成对生成它们,同时利用所有可用内存,但不超过内存限制。下面是我要实现的方法的签名:
static IEnumerable<(TItem, TItem)> GetPairs<TSource, TItem>(
IReadOnlyList<TSource> source,
Func<TSource, long> sizeSelector,
Func<TSource, TItem> itemLoader,
long maxConcurrentSize) where TItem : IDisposable;
TSource
将是文件路径,TItem
将是ImageInfo
。我打算这样使用它:
string[] paths = Directory.GetFiles(@"C:Images", "*.jpg");
var pairs = GetPairs(paths,
path => new FileInfo(path).Length,
path => new ImageInfo() { Path = path, Image = Image.FromFile(path) },
2_000_000_000);
foreach (var (x, y) in pairs)
{
if (AreSimilar(x, y))
Console.WriteLine($"{x.Path} and {y.Path} are similar!");
}
我目前不知道如何实现这个方法。这看起来是一项严肃的任务。我现在所拥有的只是下面的简单版本,它成对加载图像并忽略sizeSelector
和maxConcurrentSize
参数:
static IEnumerable<(TItem, TItem)> GetPairs<TSource, TItem>(
IReadOnlyList<TSource> source,
Func<TSource, long> sizeSelector,
Func<TSource, TItem> itemLoader,
long maxConcurrentSize) where TItem : IDisposable
{
for (int i = 0; i < source.Count; i++)
{
using var first = itemLoader(source[i]);
for (int j = i + 1; j < source.Count; j++)
{
using var second = itemLoader(source[j]);
yield return (first, second);
}
}
}
显然,性能很糟糕,因为每个图像平均加载约500次。
这是一个针对您的问题的解决方案,包含了一个单元测试。不幸的是,在一次只能加载少量图像的情况下,它的性能很差,导致最坏的情况是您建议的解决方案的加载数量的两倍。但是,当一次可以加载大量图像时,此算法开始优于您的算法,随着允许的内存大小的增加,每个图像只能加载1个。
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;
namespace UnitTest;
[TestClass]
public class TestComparison
{
[TestMethod]
public void Test()
{
const int imageCount = 10;
var (totalLoads, max, pairs) = RunTest(1, 5, 1000, imageCount);
Assert.AreEqual(10, totalLoads);
Assert.AreEqual(1, max);
(_, max, var pairs2) = RunTest(5, 5, 12, imageCount);
Assert.AreEqual(9, max);
var expectedPairs = (imageCount - 1) * imageCount / 2;
Assert.AreEqual(expectedPairs, pairs.Count);
Assert.AreEqual(expectedPairs, pairs2.Count);
}
private (int totalLoads, int maxLoads, List<(ImageInfo, ImageInfo)> pairs) RunTest(int minImageSize, int maxImageSize, long memorySize, int imageCount)
{
var images = GenerateImages(imageCount, minImageSize, maxImageSize);
var paths = images.Keys.ToList();
var imageLoadCounts = new Dictionary<string, int>();
var pairs = GetPairs(paths,p=>images[p].Size,LoadImage,memorySize).ToList();
var totalLoads = imageLoadCounts.Values.Sum();
var maxLoad = imageLoadCounts.Values.Max();
return new(totalLoads, maxLoad,pairs);
ImageInfo LoadImage(string path)
{
if (!imageLoadCounts.TryGetValue(path, out int count))
{
count = 0;
}
count++;
imageLoadCounts[path] = count;
return images[path];
}
}
private Dictionary<string, ImageInfo> GenerateImages(int imageCount, int minImageSize, int maxImageSize)
{
var images = new Dictionary<string,ImageInfo>();
for (int i = 0; i < imageCount; i++)
{
images[RandomString()] = new() { Value = _random.NextSingle(), Size = _random.Next(minImageSize, maxImageSize) };
}
return images;
}
class ImageInfo:IDisposable
{
public int Size { get; set; }
public float Value { get; set; }
public void Dispose()
{
}
}
private static readonly Random _random = new();
static string RandomString()
{
const int length = 10;
var str = string.Empty;
for (int i = 0; i < length; i++)
{
str += (char)_random.Next(65, 90);
}
return str;
}
static bool AreSimilar(ImageInfo x, ImageInfo y) => Math.Abs(y.Value-x.Value)<.1f;
record Comparison<T>(T Path1, T Path2)
{
public bool Contains(T path) => Path1.Equals(path) || Path2.Equals(path);
public T Other(T path)
{
if (Path1.Equals(path)) return Path2;
if (Path2.Equals(path)) return Path1;
throw new Exception("invalid path");
}
public bool IsEquivalentTo(Comparison<T> other) => (other.Path1.Equals(Path1) && other.Path2.Equals(Path2)) ||
(other.Path2.Equals(Path1) && other.Path1.Equals(Path2));
}
static IEnumerable<(TItem, TItem)> GetPairs<TSource, TItem>(
IReadOnlyList<TSource> source,
Func<TSource, long> sizeSelector,
Func<TSource, TItem> itemLoader,
long maxConcurrentSize) where TItem : IDisposable
{
var itemCount = source.Count;
var comparisons = new List<Comparison<TSource>>(itemCount * itemCount / 2);
for (int i = 0; i < itemCount - 1; i++)
{
for (int j = i + 1; j < itemCount; j++)
{
comparisons.Add(new(source[i], source[j]));
}
}
return GetPairs(comparisons,sizeSelector,itemLoader,maxConcurrentSize);
}
static IEnumerable<(TItem, TItem)> GetPairs<TSource,TItem>(List<Comparison<TSource>> remainingComparisons,
Func<TSource, long> sizeSelector,
Func<TSource, TItem> itemLoader,
long maxConcurrentSize) where TItem:IDisposable
{
if(!remainingComparisons.Any()) yield break;
var images = LoadImages(remainingComparisons,sizeSelector,itemLoader,maxConcurrentSize);//load as many images as possible from the remaining comparisons
var imageCount = images.Count;
for (int i = 0; i < imageCount - 1; i++)
{
var (path1, image1) = images[i];
for (int j = i + 1; j < imageCount; j++)
{
var (path2, image2) = images[j];
yield return new(image1, image2);
var comparison = new Comparison<TSource>(path1, path2);
remainingComparisons.RemoveAll(c => c.IsEquivalentTo(comparison));
}
}
//dispose
foreach (var image in images.Select(i => i.Image))
{
image.Dispose();
}
images = null;//allow GC
foreach (var pair in GetPairs(remainingComparisons,sizeSelector,itemLoader,maxConcurrentSize))
{
yield return pair;
}
}
/// <summary>
/// Loads as many images into memory as possible from the remaining comparison list
/// </summary>
static List<(TSource Path, TITem Image)> LoadImages<TSource,TITem>(List<Comparison<TSource>> remainingComparisons, Func<TSource, long> sizeSelector,
Func<TSource, TITem> itemLoader,
long maxConcurrentSize)
{
var availableMemory = maxConcurrentSize;
remainingComparisons = remainingComparisons.ToList();//copy the list so we can alter it in local scope
var loadedImages = new List<(TSource Path, TITem Image)>();
if (!TryGetSeed(out var seed)) throw new Exception("One of the images is too large to load into memory");
while (remainingComparisons.Any())
{
if (!remainingComparisons.TryGetFirst(c => c.Contains(seed),out var comparison ))
{
if (!TryGetSeed(out seed)) break;
continue;
}
var other = comparison.Other(seed);
remainingComparisons.Remove(comparison);
if (!TryLoad(other)) break;
}
return loadedImages;
bool TryLoad(TSource path)
{
var fileSize = sizeSelector(path);
if (fileSize > availableMemory) return false;
loadedImages.Add(new(path, itemLoader(path)));
availableMemory -= fileSize;
return true;
}
bool TryGetSeed(out TSource seed)
{
//first, remove any comparisons that are relevant to the current loaded list
var loadedImageCount = loadedImages.Count;
for (int i = 0; i < loadedImageCount-1; i++)
{
for (int j = 1; j < loadedImageCount; j++)
{
var localComparison = new Comparison<TSource>(loadedImages[i].Path, loadedImages[j].Path);
remainingComparisons.RemoveAll(c => c.IsEquivalentTo(localComparison));
}
}
if (!remainingComparisons.TryGetFirst(out var firstRemaining))
{
seed = default;
return false;
}
seed = firstRemaining.Path1;
return TryLoad(seed);
}
}
}
public static class Extensions
{
public static bool TryGetFirst<T>(this IEnumerable<T> items, out T value) =>
items.TryGetFirst(t => true, out value);
public static bool TryGetFirst<T>(this IEnumerable<T> items, Predicate<T> condition, out T value)
{
foreach (var item in items)
{
if (condition(item))
{
value = item;
return true;
}
}
value = default;
return false;
}
}
为了回答你的问题,忽略了时间复杂度。TryGetSeed
的当前实现使时间复杂度为O(N^3),但这很容易改进。我怀疑同样的算法可以在O(N^2)时间内写出来,这是这个问题的最佳时间复杂度。
我设法想出了一个紧凑的算法来解决这个相当困难的问题。该算法维护5条信息作为状态:
- 当前加载的项列表。
- 当前加载项的总大小。
- 当前未加载项的队列。
- 一个数组,包含每个项目的剩余配对数。
- 一个
BitArray
,存储是否有任何特定的pair被发射。
算法在两个集合loaded
和unloaded
之间移动项,根据maxConcurrentSize
策略加载和卸载项,从loaded
列表中产生尽可能多的对,直到两个集合都为空。在这一点上,只要算法是正确的,预计会发出所有可能的对。
public static IEnumerable<(TItem, TItem)> GetPairs<TSource, TItem>(
IReadOnlyList<TSource> source,
Func<TSource, long> sizeSelector,
Func<TSource, TItem> itemLoader,
long maxConcurrentSize) where TItem : IDisposable
{
List<(int Index, TItem Item, long Size)> loaded = new();
try
{
long loadedSumSize = 0L;
Queue<int> unloaded = new(Enumerable.Range(0, source.Count));
int[] pending = new int[source.Count];
for (int i = 0; i < pending.Length; i++) pending[i] = source.Count - 1;
BitArray emittedPairs = new(checked(source.Count * (source.Count - 1) >> 1));
while (unloaded.Count > 0)
{
// Select the next item to load, in FIFO order.
int indexToLoad = unloaded.Dequeue();
long sizeToLoad = Math.Max(0L, sizeSelector(source[indexToLoad]));
// Unload already loaded items until there is enough space for the
// new item, in LIFO order.
while (loaded.Count > 1 && loadedSumSize + sizeToLoad > maxConcurrentSize)
{
var toUnload = loaded[loaded.Count - 1];
loaded.RemoveAt(loaded.Count - 1);
toUnload.Item.Dispose();
unloaded.Enqueue(toUnload.Index);
loadedSumSize -= toUnload.Size;
}
// Load the new item.
var itemToLoad = itemLoader(source[indexToLoad]);
loaded.Add((indexToLoad, itemToLoad, sizeToLoad));
checked { loadedSumSize += sizeToLoad; }
var justLoaded = loaded[loaded.Count - 1];
// Yield pairs constituted of the new item with all already loaded items.
for (int i = 0; i < loaded.Count - 1; i++)
{
var existing = loaded[i];
Debug.Assert(existing.Index != justLoaded.Index);
// Switch the order in case the pair is not in the correct order.
var (x, y) = existing.Index < justLoaded.Index ?
(existing, justLoaded) : (justLoaded, existing);
// Emit the pair if it's not already emitted.
int emittedIndex = (y.Index * (y.Index - 1) >> 1) + x.Index;
if (emittedPairs[emittedIndex]) continue;
yield return (x.Item, y.Item);
emittedPairs[emittedIndex] = true;
pending[x.Index]--;
pending[y.Index]--;
}
// Unload all items that are completed.
for (int i = loaded.Count - 1; i >= 0; i--)
{
var (index, item, size) = loaded[i];
if (pending[index] > 0) continue;
Debug.Assert(pending[index] == 0);
loaded.RemoveAt(i);
item.Dispose();
loadedSumSize -= size;
}
}
// If the algorithm is correct, these final assertions should never fail.
Debug.Assert(loaded.Count == 0);
Debug.Assert(loadedSumSize == 0L);
Debug.Assert(pending.All(n => n == 0));
Debug.Assert(emittedPairs.Cast<bool>().All(b => b));
}
finally
{
// Ensure that in case the enumerable is enumerated only partially,
// all currently loaded items will be disposed.
foreach (var entry in loaded) entry.Item.Dispose();
}
}
我通过编写原始问题的模拟来测量此实现的效率:
- 1000个文件,每个文件的大小在5到50 MB之间。
maxConcurrentSize
= 2gb
var source = Enumerable.Range(1, 1000).ToArray();
var random = new Random(0);
var sizes = source.ToDictionary(x => x,
_ => (long)random.Next(5_000_000, 50_000_000));
int loaderInvokedCount = 0;
var query = GetPairs(source, x => sizes[x], x =>
{
loaderInvokedCount++;
return new MemoryStream();
}, 2_000_000_000);
var emittedPairsCount = query.Count();
Console.WriteLine($"Emitted {emittedPairsCount:#,0} pairs");
Console.WriteLine($"Loader invoked {loaderInvokedCount:#,0} times");
输出:
Emitted 499,500 pairs
Loader invoked 7,682 times
现场演示。
作为问题的一部分提供的幼稚实现对相同的输入调用加载器500,500次,与此实现相比多了65倍。获得98.5%的折扣是相当令人满意的结果。