原文: https://www.programiz.com/dsa/huffman-coding

在本教程中,您将学习霍夫曼编码的工作原理。 此外,您还将找到使用 C,C++ ,Java 和 Python 进行霍夫曼编码的工作示例。

霍夫曼编码是一种压缩数据以减小其大小而又不丢失任何细节的技术。 它最早由 David Huffman 开发。

霍夫曼编码通常可用于压缩其中经常出现字符的数据。


霍夫曼编码的工作原理?

假设下面的字符串要通过网络发送。

霍夫曼编码 - 图1

初始字符串

每个字符占用 8 位。 上面的字符串中总共有 15 个字符。 因此,总共需要8 * 15 = 120位来发送此字符串。

使用霍夫曼编码技术,我们可以将字符串压缩为较小的大小。

霍夫曼编码首先使用字符的频率创建一棵树,然后为每个字符生成代码。

一旦数据被编码,就必须被解码。 解码是使用同一棵树完成的。

霍夫曼编码使用前缀代码的概念,可以防止解码过程中出现歧义。 与字符关联的代码不应出现在任何其他代码的前缀中。 上面创建的树有助于维护属性。

霍夫曼编码是通过以下步骤完成的。

  1. 计算字符串中每个字符的频率。
    霍夫曼编码 - 图2
    字符串的频率

  2. 按频率升序对字符进行排序。 这些存储在优先队列Q中。
    霍夫曼编码 - 图3
    根据频率
    排序的字符

  3. 使每个唯一字符作为叶节点。

  4. 创建一个空节点z。 将最小频率分配给z的左子代,并将第二个最小频率分配给z的右代。 将z的值设置为上述两个最小频率的总和。
    霍夫曼编码 - 图4
    取得最小数的总和

  5. Q中删除这两个最小频率,并将总和添加到频率列表中(*表示上图中的内部节点)。

  6. 将节点z插入树中。

  7. 对所有字符重复步骤 3 到 5。
    霍夫曼编码 - 图5
    对所有字符重复步骤 3 至 5。
    Â
    霍夫曼编码 - 图6
    对所有字符重复步骤 3 至 5。

  8. 对于每个非叶节点,将 0 分配给左边,将 1 分配给右边。
    霍夫曼编码 - 图7
    将 0 分配给左边,将 1 分配给右边

为了通过网络发送上面的字符串,我们必须发送树以及上面的压缩代码。 总大小如下表所示。

字符 频率 编码 大小
a 5 11 5 * 2 = 10
b 1 100 1 * 3 = 3
c 6 0 6 * 1 = 6
d 3 101 3 * 3 = 9
4 * 8 = 32 15 位 28 位

如果不进行编码,则字符串的总大小为 120 位。 编码后,大小减小为32 + 15 + 28 = 75


解码代码

为了解码代码,我们可以获取代码并遍历树以找到字符。

假设要解码 101,我们可以如下图所示从根开始遍历。

霍夫曼编码 - 图8

解码


霍夫曼编码算法

  1. create a priority queue Q consisting of each unique character.
  2. sort then in ascending order of their frequencies.
  3. for all the unique characters:
  4. create a newNode
  5. extract minimum value from Q and assign it to leftChild of newNode
  6. extract minimum value from Q and assign it to rightChild of newNode
  7. calculate the sum of these two minimum values and assign it to the value of newNode
  8. insert this newNode into the tree
  9. return rootNode

Python,Java 和 C/C++ 示例

  1. # Huffman Coding in python
  2. string = 'BCAADDDCCACACAC'
  3. # Creating tree nodes
  4. class NodeTree(object):
  5. def __init__(self, left=None, right=None):
  6. self.left = left
  7. self.right = right
  8. def children(self):
  9. return (self.left, self.right)
  10. def nodes(self):
  11. return (self.left, self.right)
  12. def __str__(self):
  13. return '%s_%s' % (self.left, self.right)
  14. # Main function implementing huffman coding
  15. def huffman_code_tree(node, left=True, binString=''):
  16. if type(node) is str:
  17. return {node: binString}
  18. (l, r) = node.children()
  19. d = dict()
  20. d.update(huffman_code_tree(l, True, binString + '0'))
  21. d.update(huffman_code_tree(r, False, binString + '1'))
  22. return d
  23. # Calculating frequency
  24. freq = {}
  25. for c in string:
  26. if c in freq:
  27. freq[c] += 1
  28. else:
  29. freq[c] = 1
  30. freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
  31. nodes = freq
  32. while len(nodes) > 1:
  33. (key1, c1) = nodes[-1]
  34. (key2, c2) = nodes[-2]
  35. nodes = nodes[:-2]
  36. node = NodeTree(key1, key2)
  37. nodes.append((node, c1 + c2))
  38. nodes = sorted(nodes, key=lambda x: x[1], reverse=True)
  39. huffmanCode = huffman_code_tree(nodes[0][0])
  40. print(' Char | Huffman code ')
  41. print('----------------------')
  42. for (char, frequency) in freq:
  43. print(' %-4r |%12s' % (char, huffmanCode[char]))
  1. // Huffman Coding in Java
  2. import java.util.PriorityQueue;
  3. import java.util.Comparator;
  4. class HuffmanNode {
  5. int item;
  6. char c;
  7. HuffmanNode left;
  8. HuffmanNode right;
  9. }
  10. // For comparing the nodes
  11. class ImplementComparator implements Comparator<HuffmanNode> {
  12. public int compare(HuffmanNode x, HuffmanNode y) {
  13. return x.item - y.item;
  14. }
  15. }
  16. // IMplementing the huffman algorithm
  17. public class Huffman {
  18. public static void printCode(HuffmanNode root, String s) {
  19. if (root.left == null && root.right == null && Character.isLetter(root.c)) {
  20. System.out.println(root.c + " | " + s);
  21. return;
  22. }
  23. printCode(root.left, s + "0");
  24. printCode(root.right, s + "1");
  25. }
  26. public static void main(String[] args) {
  27. int n = 4;
  28. char[] charArray = { 'A', 'B', 'C', 'D' };
  29. int[] charfreq = { 5, 1, 6, 3 };
  30. PriorityQueue<HuffmanNode> q = new PriorityQueue<HuffmanNode>(n, new ImplementComparator());
  31. for (int i = 0; i < n; i++) {
  32. HuffmanNode hn = new HuffmanNode();
  33. hn.c = charArray[i];
  34. hn.item = charfreq[i];
  35. hn.left = null;
  36. hn.right = null;
  37. q.add(hn);
  38. }
  39. HuffmanNode root = null;
  40. while (q.size() > 1) {
  41. HuffmanNode x = q.peek();
  42. q.poll();
  43. HuffmanNode y = q.peek();
  44. q.poll();
  45. HuffmanNode f = new HuffmanNode();
  46. f.item = x.item + y.item;
  47. f.c = '-';
  48. f.left = x;
  49. f.right = y;
  50. root = f;
  51. q.add(f);
  52. }
  53. System.out.println(" Char | Huffman code ");
  54. System.out.println("--------------------");
  55. printCode(root, "");
  56. }
  57. }
  1. // Huffman Coding in C
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #define MAX_TREE_HT 50
  5. struct MinHNode {
  6. char item;
  7. unsigned freq;
  8. struct MinHNode *left, *right;
  9. };
  10. struct MinHeap {
  11. unsigned size;
  12. unsigned capacity;
  13. struct MinHNode **array;
  14. };
  15. // Create nodes
  16. struct MinHNode *newNode(char item, unsigned freq) {
  17. struct MinHNode *temp = (struct MinHNode *)malloc(sizeof(struct MinHNode));
  18. temp->left = temp->right = NULL;
  19. temp->item = item;
  20. temp->freq = freq;
  21. return temp;
  22. }
  23. // Create min heap
  24. struct MinHeap *createMinH(unsigned capacity) {
  25. struct MinHeap *minHeap = (struct MinHeap *)malloc(sizeof(struct MinHeap));
  26. minHeap->size = 0;
  27. minHeap->capacity = capacity;
  28. minHeap->array = (struct MinHNode **)malloc(minHeap->capacity * sizeof(struct MinHNode *));
  29. return minHeap;
  30. }
  31. // Function to swap
  32. void swapMinHNode(struct MinHNode **a, struct MinHNode **b) {
  33. struct MinHNode *t = *a;
  34. *a = *b;
  35. *b = t;
  36. }
  37. // Heapify
  38. void minHeapify(struct MinHeap *minHeap, int idx) {
  39. int smallest = idx;
  40. int left = 2 * idx + 1;
  41. int right = 2 * idx + 2;
  42. if (left < minHeap->size && minHeap->array[left]->freq < minHeap->array[smallest]->freq)
  43. smallest = left;
  44. if (right < minHeap->size && minHeap->array[right]->freq < minHeap->array[smallest]->freq)
  45. smallest = right;
  46. if (smallest != idx) {
  47. swapMinHNode(&minHeap->array[smallest], &minHeap->array[idx]);
  48. minHeapify(minHeap, smallest);
  49. }
  50. }
  51. // Check if size if 1
  52. int checkSizeOne(struct MinHeap *minHeap) {
  53. return (minHeap->size == 1);
  54. }
  55. // Extract min
  56. struct MinHNode *extractMin(struct MinHeap *minHeap) {
  57. struct MinHNode *temp = minHeap->array[0];
  58. minHeap->array[0] = minHeap->array[minHeap->size - 1];
  59. --minHeap->size;
  60. minHeapify(minHeap, 0);
  61. return temp;
  62. }
  63. // Insertion function
  64. void insertMinHeap(struct MinHeap *minHeap, struct MinHNode *minHeapNode) {
  65. ++minHeap->size;
  66. int i = minHeap->size - 1;
  67. while (i && minHeapNode->freq < minHeap->array[(i - 1) / 2]->freq) {
  68. minHeap->array[i] = minHeap->array[(i - 1) / 2];
  69. i = (i - 1) / 2;
  70. }
  71. minHeap->array[i] = minHeapNode;
  72. }
  73. void buildMinHeap(struct MinHeap *minHeap) {
  74. int n = minHeap->size - 1;
  75. int i;
  76. for (i = (n - 1) / 2; i >= 0; --i)
  77. minHeapify(minHeap, i);
  78. }
  79. int isLeaf(struct MinHNode *root) {
  80. return !(root->left) && !(root->right);
  81. }
  82. struct MinHeap *createAndBuildMinHeap(char item[], int freq[], int size) {
  83. struct MinHeap *minHeap = createMinH(size);
  84. for (int i = 0; i < size; ++i)
  85. minHeap->array[i] = newNode(item[i], freq[i]);
  86. minHeap->size = size;
  87. buildMinHeap(minHeap);
  88. return minHeap;
  89. }
  90. struct MinHNode *buildHuffmanTree(char item[], int freq[], int size) {
  91. struct MinHNode *left, *right, *top;
  92. struct MinHeap *minHeap = createAndBuildMinHeap(item, freq, size);
  93. while (!checkSizeOne(minHeap)) {
  94. left = extractMin(minHeap);
  95. right = extractMin(minHeap);
  96. top = newNode('$', left->freq + right->freq);
  97. top->left = left;
  98. top->right = right;
  99. insertMinHeap(minHeap, top);
  100. }
  101. return extractMin(minHeap);
  102. }
  103. void printHCodes(struct MinHNode *root, int arr[], int top) {
  104. if (root->left) {
  105. arr[top] = 0;
  106. printHCodes(root->left, arr, top + 1);
  107. }
  108. if (root->right) {
  109. arr[top] = 1;
  110. printHCodes(root->right, arr, top + 1);
  111. }
  112. if (isLeaf(root)) {
  113. printf(" %c | ", root->item);
  114. printArray(arr, top);
  115. }
  116. }
  117. // Wrapper function
  118. void HuffmanCodes(char item[], int freq[], int size) {
  119. struct MinHNode *root = buildHuffmanTree(item, freq, size);
  120. int arr[MAX_TREE_HT], top = 0;
  121. printHCodes(root, arr, top);
  122. }
  123. // Print the array
  124. void printArray(int arr[], int n) {
  125. int i;
  126. for (i = 0; i < n; ++i)
  127. printf("%d", arr[i]);
  128. printf("\n");
  129. }
  130. int main() {
  131. char arr[] = {'A', 'B', 'C', 'D'};
  132. int freq[] = {5, 1, 6, 3};
  133. int size = sizeof(arr) / sizeof(arr[0]);
  134. printf(" Char | Huffman code ");
  135. printf("\n--------------------\n");
  136. HuffmanCodes(arr, freq, size);
  137. }
  1. // Huffman Coding in C++
  2. #include <iostream>
  3. using namespace std;
  4. #define MAX_TREE_HT 50
  5. struct MinHNode {
  6. unsigned freq;
  7. char item;
  8. struct MinHNode *left, *right;
  9. };
  10. struct MinH {
  11. unsigned size;
  12. unsigned capacity;
  13. struct MinHNode **array;
  14. };
  15. // Creating Huffman tree node
  16. struct MinHNode *newNode(char item, unsigned freq) {
  17. struct MinHNode *temp = (struct MinHNode *)malloc(sizeof(struct MinHNode));
  18. temp->left = temp->right = NULL;
  19. temp->item = item;
  20. temp->freq = freq;
  21. return temp;
  22. }
  23. // Create min heap using given capacity
  24. struct MinH *createMinH(unsigned capacity) {
  25. struct MinH *minHeap = (struct MinH *)malloc(sizeof(struct MinH));
  26. minHeap->size = 0;
  27. minHeap->capacity = capacity;
  28. minHeap->array = (struct MinHNode **)malloc(minHeap->capacity * sizeof(struct MinHNode *));
  29. return minHeap;
  30. }
  31. // Swap function
  32. void swapMinHNode(struct MinHNode **a, struct MinHNode **b) {
  33. struct MinHNode *t = *a;
  34. *a = *b;
  35. *b = t;
  36. }
  37. // Heapify
  38. void minHeapify(struct MinH *minHeap, int idx) {
  39. int smallest = idx;
  40. int left = 2 * idx + 1;
  41. int right = 2 * idx + 2;
  42. if (left < minHeap->size && minHeap->array[left]->freq < minHeap->array[smallest]->freq)
  43. smallest = left;
  44. if (right < minHeap->size && minHeap->array[right]->freq < minHeap->array[smallest]->freq)
  45. smallest = right;
  46. if (smallest != idx) {
  47. swapMinHNode(&minHeap->array[smallest],
  48. &minHeap->array[idx]);
  49. minHeapify(minHeap, smallest);
  50. }
  51. }
  52. // Check if size if 1
  53. int checkSizeOne(struct MinH *minHeap) {
  54. return (minHeap->size == 1);
  55. }
  56. // Extract the min
  57. struct MinHNode *extractMin(struct MinH *minHeap) {
  58. struct MinHNode *temp = minHeap->array[0];
  59. minHeap->array[0] = minHeap->array[minHeap->size - 1];
  60. --minHeap->size;
  61. minHeapify(minHeap, 0);
  62. return temp;
  63. }
  64. // Insertion
  65. void insertMinHeap(struct MinH *minHeap, struct MinHNode *minHeapNode) {
  66. ++minHeap->size;
  67. int i = minHeap->size - 1;
  68. while (i && minHeapNode->freq < minHeap->array[(i - 1) / 2]->freq) {
  69. minHeap->array[i] = minHeap->array[(i - 1) / 2];
  70. i = (i - 1) / 2;
  71. }
  72. minHeap->array[i] = minHeapNode;
  73. }
  74. // BUild min heap
  75. void buildMinHeap(struct MinH *minHeap) {
  76. int n = minHeap->size - 1;
  77. int i;
  78. for (i = (n - 1) / 2; i >= 0; --i)
  79. minHeapify(minHeap, i);
  80. }
  81. int isLeaf(struct MinHNode *root) {
  82. return !(root->left) && !(root->right);
  83. }
  84. struct MinH *createAndBuildMinHeap(char item[], int freq[], int size) {
  85. struct MinH *minHeap = createMinH(size);
  86. for (int i = 0; i < size; ++i)
  87. minHeap->array[i] = newNode(item[i], freq[i]);
  88. minHeap->size = size;
  89. buildMinHeap(minHeap);
  90. return minHeap;
  91. }
  92. struct MinHNode *buildHfTree(char item[], int freq[], int size) {
  93. struct MinHNode *left, *right, *top;
  94. struct MinH *minHeap = createAndBuildMinHeap(item, freq, size);
  95. while (!checkSizeOne(minHeap)) {
  96. left = extractMin(minHeap);
  97. right = extractMin(minHeap);
  98. top = newNode('$', left->freq + right->freq);
  99. top->left = left;
  100. top->right = right;
  101. insertMinHeap(minHeap, top);
  102. }
  103. return extractMin(minHeap);
  104. }
  105. void printHCodes(struct MinHNode *root, int arr[], int top) {
  106. if (root->left) {
  107. arr[top] = 0;
  108. printHCodes(root->left, arr, top + 1);
  109. }
  110. if (root->right) {
  111. arr[top] = 1;
  112. printHCodes(root->right, arr, top + 1);
  113. }
  114. if (isLeaf(root)) {
  115. cout << root->item << " | ";
  116. printArray(arr, top);
  117. }
  118. }
  119. // Wrapper function
  120. void HuffmanCodes(char item[], int freq[], int size) {
  121. struct MinHNode *root = buildHfTree(item, freq, size);
  122. int arr[MAX_TREE_HT], top = 0;
  123. printHCodes(root, arr, top);
  124. }
  125. // Print the array
  126. void printArray(int arr[], int n) {
  127. int i;
  128. for (i = 0; i < n; ++i)
  129. cout << arr[i];
  130. cout << "\n";
  131. }
  132. int main() {
  133. char arr[] = {'A', 'B', 'C', 'D'};
  134. int freq[] = {5, 1, 6, 3};
  135. int size = sizeof(arr) / sizeof(arr[0]);
  136. cout << "Char | Huffman code ";
  137. cout << "\n----------------------\n";
  138. HuffmanCodes(arr, freq, size);
  139. }

霍夫曼编码复杂度

基于每个唯一字符的频率进行编码的时间复杂度为O(nlog n)

从优先队列中提取最小频率发生2*(n-1)次,其复杂度为O(log n)。 因此,总体复杂度为O(nlog n)


霍夫曼编码应用

  • 霍夫曼编码用于常规压缩格式,例如 GZIP,BZIP2,PKZIP 等。
  • 用于文本和传真传输。