deterministicGrouping.js 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. "use strict";
  2. // Simulations show these probabilities for a single change
  3. // 93.1% that one group is invalidated
  4. // 4.8% that two groups are invalidated
  5. // 1.1% that 3 groups are invalidated
  6. // 0.1% that 4 or more groups are invalidated
  7. //
  8. // And these for removing/adding 10 lexically adjacent files
  9. // 64.5% that one group is invalidated
  10. // 24.8% that two groups are invalidated
  11. // 7.8% that 3 groups are invalidated
  12. // 2.7% that 4 or more groups are invalidated
  13. //
  14. // And these for removing/adding 3 random files
  15. // 0% that one group is invalidated
  16. // 3.7% that two groups are invalidated
  17. // 80.8% that 3 groups are invalidated
  18. // 12.3% that 4 groups are invalidated
  19. // 3.2% that 5 or more groups are invalidated
  /**
   * Scores how alike two keys are by comparing them position by position.
   *
   * Every compared position contributes between 0 and 10 points: 10 when the
   * char codes are identical, one point less for each unit of char code
   * distance, and nothing once the codes are 10 or more apart. Only the first
   * `min(a.length, b.length)` positions are compared.
   *
   * @param {string} a key
   * @param {string} b key
   * @returns {number} the similarity as number (higher means more alike)
   */
  const similarity = (a, b) => {
    const compared = Math.min(a.length, b.length);
    let score = 0;
    let index = 0;
    while (index < compared) {
      const gap = Math.abs(a.charCodeAt(index) - b.charCodeAt(index));
      // positions that are 10 or more char codes apart add nothing
      if (gap < 10) {
        score += 10 - gap;
      }
      index++;
    }
    return score;
  };
  /**
   * Derives a short name from two keys: the prefix both keys share plus the
   * first character of `a` at which they differ. When the keys do not differ
   * within the first `min(a.length, b.length)` characters, `a` is returned
   * unchanged.
   *
   * @param {string} a key
   * @param {string} b key
   * @returns {string} the common part and a single char for the difference
   */
  const getName = (a, b) => {
    const limit = Math.min(a.length, b.length);
    let index = 0;
    // advance over the shared prefix
    while (index < limit && a.charAt(index) === b.charAt(index)) {
      index++;
    }
    // index < limit means a differing character was found at `index`
    return index < limit ? a.slice(0, index + 1) : a;
  };
  /**
   * A single item to be grouped, stored together with the key and size that
   * were computed for it.
   *
   * @template T
   */
  class Node {
    /**
     * @param {T} item item
     * @param {string} key key
     * @param {number} size size
     */
    constructor(item, key, size) {
      // own properties are created in the order item, key, size
      Object.assign(this, { item, key, size });
    }
  }
  /**
   * A run of nodes together with the similarities between neighbouring nodes
   * and the summed size of all contained nodes.
   *
   * @template T
   */
  class Group {
    /**
     * @param {Node<T>[]} nodes nodes
     * @param {number[]} similarities similarities between the nodes (length = nodes.length - 1)
     */
    constructor(nodes, similarities) {
      this.nodes = nodes;
      this.similarities = similarities;

      // total size is the sum of the sizes of all nodes (0 for no nodes)
      let total = 0;
      for (const { size } of nodes) {
        total += size;
      }
      this.size = total;

      // starts out undefined; no value is assigned inside this class
      /** @type {string} */
      this.key = undefined;
    }
  }
  86. /**
  87. * @template T
  88. * @typedef {Object} GroupedItems<T>
  89. * @property {string} key
  90. * @property {T[]} items
  91. * @property {number} size
  92. */
  93. /**
  94. * @template T
  95. * @typedef {Object} Options
  96. * @property {number} maxSize maximum size of a group
  97. * @property {number} minSize minimum size of a group (preferred over maximum size)
  98. * @property {Iterable<T>} items a list of items
  99. * @property {function(T): number} getSize function to get size of an item
  100. * @property {function(T): string} getKey function to get the key of an item
  101. */
  102. /**
  103. * @template T
  104. * @param {Options<T>} options options object
  105. * @returns {GroupedItems<T>[]} grouped items
  106. */
  107. module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
  108. /** @type {Group<T>[]} */
  109. const result = [];
  110. const nodes = Array.from(
  111. items,
  112. item => new Node(item, getKey(item), getSize(item))
  113. );
  114. /** @type {Node<T>[]} */
  115. const initialNodes = [];
  116. // return nodes bigger than maxSize directly as group
  117. for (const node of nodes) {
  118. if (node.size >= maxSize) {
  119. result.push(new Group([node], []));
  120. } else {
  121. initialNodes.push(node);
  122. }
  123. }
  124. if (initialNodes.length > 0) {
  125. // lexically ordering of keys
  126. initialNodes.sort((a, b) => {
  127. if (a.key < b.key) return -1;
  128. if (a.key > b.key) return 1;
  129. return 0;
  130. });
  131. // calculate similarities between lexically adjacent nodes
  132. /** @type {number[]} */
  133. const similarities = [];
  134. for (let i = 1; i < initialNodes.length; i++) {
  135. const a = initialNodes[i - 1];
  136. const b = initialNodes[i];
  137. similarities.push(similarity(a.key, b.key));
  138. }
  139. const queue = [new Group(initialNodes, similarities)];
  140. while (queue.length) {
  141. const group = queue.pop();
  142. // only groups bigger than maxSize need to be splitted
  143. if (group.size < maxSize) {
  144. result.push(group);
  145. continue;
  146. }
  147. // find unsplittable area from left and right
  148. // going minSize from left and right
  149. let left = 0;
  150. let leftSize = 0;
  151. while (leftSize < minSize) {
  152. leftSize += group.nodes[left].size;
  153. left++;
  154. }
  155. let right = group.nodes.length - 1;
  156. let rightSize = 0;
  157. while (rightSize < minSize) {
  158. rightSize += group.nodes[right].size;
  159. right--;
  160. }
  161. if (left - 1 > right) {
  162. // can't split group while holding minSize
  163. // because minSize is preferred of maxSize we return
  164. // the group here even while it's too big
  165. // To avoid this make sure maxSize > minSize * 3
  166. result.push(group);
  167. continue;
  168. }
  169. if (left <= right) {
  170. // when there is a area between left and right
  171. // we look for best split point
  172. // we split at the minimum similarity
  173. // here key space is separated the most
  174. let best = left - 1;
  175. let bestSimilarity = group.similarities[best];
  176. for (let i = left; i <= right; i++) {
  177. const similarity = group.similarities[i];
  178. if (similarity < bestSimilarity) {
  179. best = i;
  180. bestSimilarity = similarity;
  181. }
  182. }
  183. left = best + 1;
  184. right = best;
  185. }
  186. // create two new groups for left and right area
  187. // and queue them up
  188. const rightNodes = [group.nodes[right + 1]];
  189. /** @type {number[]} */
  190. const rightSimilaries = [];
  191. for (let i = right + 2; i < group.nodes.length; i++) {
  192. rightSimilaries.push(group.similarities[i - 1]);
  193. rightNodes.push(group.nodes[i]);
  194. }
  195. queue.push(new Group(rightNodes, rightSimilaries));
  196. const leftNodes = [group.nodes[0]];
  197. /** @type {number[]} */
  198. const leftSimilaries = [];
  199. for (let i = 1; i < left; i++) {
  200. leftSimilaries.push(group.similarities[i - 1]);
  201. leftNodes.push(group.nodes[i]);
  202. }
  203. queue.push(new Group(leftNodes, leftSimilaries));
  204. }
  205. }
  206. // lexically ordering
  207. result.sort((a, b) => {
  208. if (a.nodes[0].key < b.nodes[0].key) return -1;
  209. if (a.nodes[0].key > b.nodes[0].key) return 1;
  210. return 0;
  211. });
  212. // give every group a name
  213. for (let i = 0; i < result.length; i++) {
  214. const group = result[i];
  215. const first = group.nodes[0];
  216. const last = group.nodes[group.nodes.length - 1];
  217. let name = getName(first.key, last.key);
  218. group.key = name;
  219. }
  220. // return the results
  221. return result.map(group => {
  222. /** @type {GroupedItems} */
  223. return {
  224. key: group.key,
  225. items: group.nodes.map(node => node.item),
  226. size: group.size
  227. };
  228. });
  229. };