按频率计数对电子邮件地址数组进行排序



我有一个数组,里面有一些重复项。一种基于重复计数对数组进行排序的有效算法,例如

['d@me.com', 'z@gmail.com', 'e@me.com', 'b@me.com', 'c@me.com', 'z@gmail.com', 'z@gmail.com', 'b@me.com', 'e@me.com']
=>
['z@gmail.com', 'e@me.com', 'b@me.com', 'd@me.com', 'c@me.com']

因为计数如下[3, 2, 2, 1, 1]

我想出了:

const itemCounts = {}
const ordereditems = []
for (let i = 0; i < allitems.length; i++) {
  let item = allitems[i];
  itemCounts[item] = itemCounts[item] ? itemCounts[item] + 1 : 1
}
const tuples = []
for (let key in itemCounts) {
  tuples.push([key, itemCounts[key]])
}
return tuples.sort((a, b) => a[1] < b[1]).map(x => x[0])

这是关于Θ(3 N + N log N)

也许洛达什更快?

也许在我计数时保持排序的优先级队列?

也许使用某种基数?

这是我

的尝试,第一次尝试编写一个片段:)

var allitems = ['d', 'a', 'e', 'b', 'c', 'a', 'a', 'b', 'e'];
function original_code(allitems) {
  const itemCounts = {}
  const ordereditems = []
  for (let i = 0; i < allitems.length; i++) {
    let item = allitems[i];
    itemCounts[item] = itemCounts[item] ? itemCounts[item] + 1 : 1
  }
  const tuples = []
  for (let key in itemCounts) {
    tuples.push([key, itemCounts[key]])
  }
  return tuples.sort((a, b) => a[1] < b[1]).map(x => x[0])
}
function myTry(allitems) {
  var arr;
  const dic = {};
  arr = [];
  for (let i = 0; i < allitems.length; i++) {
    let item = allitems[i];
    if (!dic.hasOwnProperty(item)) {
      dic[item] = 1;
      arr.push(item);
    } else
      dic[item]++;
  }
  arr.sort((a, b) => dic[b] - dic[a]);
  return arr;
}
//measure attempts
{ //original code
  var t0 = performance.now();
  var res;
  for (let j = 0; j < 10000; j++) {
    res = original_code(allitems);
  }
  var t1 = performance.now();
  console.log("original " + (t1 - t0) + " milliseconds.");
  console.log("original " + res);
}
{ //my try
  var t0 = performance.now();
  var res;
  for (let j = 0; j < 10000; j++) {
    res = myTry(allitems);
  }
  var t1 = performance.now();
  console.log("myTry " + (t1 - t0) + " milliseconds.");
  console.log("myTry " + res);
}
{ //my try
  var t0 = performance.now();
  var res;
  for (let j = 0; j < 10000; j++) {
    res = myTry(allitems);
  }
  var t1 = performance.now();
  console.log("myTry2 " + (t1 - t0) + " milliseconds.");
  console.log("myTry2 " + res);
}
{ //original code
  var t0 = performance.now();
  var res;
  for (let j = 0; j < 10000; j++) {
    res = original_code(allitems);
  }
  var t1 = performance.now();
  console.log("original2 " + (t1 - t0) + " milliseconds.");
  console.log("original2 " + res);
}

编辑
我试图进行更可靠的测量。如果有更好的方法,如果您告诉我,我将不胜感激。

+更改了排序。

使用 array#reduce 创建一个频率对象,其中包含数组中该单词的单词和频率。然后根据对象值对值进行排序,然后取出所有单词。

var arr = ['d', 'a', 'e', 'b', 'c', 'a', 'a', 'b', 'e'],
    frequency = arr.reduce((r,val) => {
      r[val] = r[val] || {val, count : 0};
      r[val].count = r[val].count + 1;
      return r;
    },{});
var result = Object.values(frequency).sort((a,b) => {
  return b.count - a.count;
}).map(({val}) => val);
console.log(result);

我直觉地期望对数线性是这个算法规范的最佳选择,但实际上这是一个(摊销的(O(n(实现的要点,我惊讶地想到了,使用哈希表(JS中的对象(的魔力!这是以占用更多空间为代价的。您需要双链表实现。我会在这里破解一个,但可能有更适合的库。

printList函数用于调试,如果您有兴趣了解其工作原理,可能会很有用。

function orderByCount(allitems) { // allitems is the array to be 'count-sorted'
  // linkedList will have at most one node per item. It will be used to build the result.
  // It will be ordered by count (seen so far) descending
  let linkedListHead = null
  let linkedListTail = null
  // if linkedList has a node for key k, itemIndex[k] is that node
  // if linkedList has no node for key k, itemIndex[k] is undefined
  let itemIndex = {}
  // for all x >= 1 if linkedList has a node with seen count <= x, countIndex[x] is the 'headmost' (first) node whose count <= x.
  // for all x >= 1 if linkedList has no node with seen count <= x, countIndex[x] is undefined
  let countIndex = {}
  // iterate over the input and maintain above invariants
  for (let i = 0; i < allitems.length; i++) {
    let item = allitems[i];
    if (itemIndex.hasOwnProperty(item)) {
      // we've already seen this item; update while preserving invariants
      const linkedNode = itemIndex[item]
      // first, remove the node, preserving invariants
      if (countIndex[linkedNode.count] === linkedNode) {
        countIndex[linkedNode.count] = linkedNode.next
      }
      if (linkedNode.previous) {
        linkedNode.previous.next = linkedNode.next
      } else {
        linkedListHead = linkedNode.next
      }
      if (linkedNode.next) {
        linkedNode.next.previous = linkedNode.previous
      } else {
        linkedListTail = linkedNode.previous
      }
      linkedNode.next = linkedNode.previous = null
      // now update the node
      linkedNode.count++
        // and add it back where it now belongs, preserving invariants
        if (countIndex.hasOwnProperty(linkedNode.count)) {
          // need to put it at the front of this count-block
          newNext = countIndex[linkedNode.count]
          newPrevious = newNext.previous
          linkedNode.next = newNext
          linkedNode.previous = newPrevious
          newNext.previous = linkedNode
          if (newPrevious) {
            newPrevious.next = linkedNode
          } else {
            linkedListHead = linkedNode
          }
          countIndex[linkedNode.count] = linkedNode
        } else {
          // it's the new greatest; create a new count-block
          countIndex[linkedNode.count] = linkedNode
          if (linkedListHead) {
            linkedListHead.previous = linkedNode
          }
          linkedNode.next = linkedListHead
          linkedListHead = linkedNode
        }
    } else {
      // this is a new item
      const linkedNode = {
        item: item,
        count: 1,
        previous: null,
        next: null
      }
      // First, insert it at the tail
      linkedNode.previous = linkedListTail
      if (linkedListTail) {
        linkedListTail.next = linkedNode
      }
      linkedListTail = linkedNode
      // now index it
      itemIndex[item] = linkedNode
      if (linkedListHead === null) {
        linkedListHead = linkedNode
      }
      if (!countIndex.hasOwnProperty(1)) {
        countIndex[1] = linkedNode
      }
    }
  }
  // turn it into a normal array as per specification 
  const result = []
  let current = linkedListHead
  while (current != null) {
    result.push(current.item)
    current = current.next
  }
  return result
}
function printList(linkedListHead, linkedListTail, countIndex) {
  let current = linkedListHead
  while (current != null) {
    toLog = JSON.stringify({
      item: current.item,
      count: current.count
    })
    if (linkedListHead === current) {
      toLog += " (HEAD)"
    }
    if (linkedListTail === current) {
      toLog += " (TAIL)"
    }
    if (countIndex[current.count] === current) {
      toLog += " <-- " + current.count
    }
    console.log(toLog)
    current = current.next
  }
  console.log()
}
const allItems = ['d', 'a', 'e', 'b', 'c', 'a', 'a', 'b', 'e']
console.log(orderByCount(allItems))

我没有测量过任何时间。但这里有一个想法,使用 filter 函数创建一个 uniqueitems 项数组,然后使用它来零填充 itemcounts 数组,从而从计数循环中删除分支。

const allitems= ['d', 'a', 'e', 'b', 'c', 'a', 'a', 'b', 'e'];
const itemCounts = {};
var uniqueitems = [];
var ordereditems = [];
// Filter out unique item list early
uniqueitems=allitems.filter( (v,i,s) => s.indexOf(v) === i );
// Set items counts of unique items to zero,
for (let i = 0; i < uniqueitems.length; i++) {
  let item = allitems[i];
  itemCounts[item] =0;
}
// Count occurences in allitems
for (let i = 0; i < allitems.length; i++) {
  let item = allitems[i];
  itemCounts[item]  ++;
}
// Sort unique items,directly based on their itemCount
ordereditems=uniqueitems.sort((a, b) => itemCounts[a] < itemCounts[b]);
console.log(ordereditems);

我认为这是一个更好的解决方案,使用arr.concat而不是排序。

请注意,对象中的键按顺序排列:

const f = (allitems) => {
  const itemCounts = {}
  for (let i = 0; i < allitems.length; i++) {
    let item = allitems[i];
    itemCounts[item] = itemCounts[item] ? itemCounts[item] + 1 : 1
  }
  
  const map = {}
  for (let key in itemCounts) {
    const count = itemCounts[key]
    if(!map[count]) {
      map[count] = []
    }
    map[count].push(key)
  }
  
  let ordereditems = []
  for (let key in map) {
    ordereditems = ordereditems.concat(map[key])
  }
  return ordereditems.reverse()
}
console.log(f(['d', 'a', 'e', 'b', 'c', 'a', 'a', 'b', 'e']))

对此没有太多测试用例,因此它可能不是很健壮。

['d', 'a', 'e', 'b', 'c', 'a', 'a', 'b', 'e'].sort((x,y)=> x.charCodeAt()-y.charCodeAt())