Session 4
Huffman's Coding Algorithm
Huffman Coding is a popular greedy algorithm used for 8. It is particularly effective for sequences where certain characters appear more frequently than others.
Note
How the Algorithm Works
The core idea behind Huffman Coding is to assign variable-length binary codes to characters based on their frequency:
- Frequency Analysis: The algorithm counts how many times each character occurs in the dataset.
- Tree Construction: It builds a binary tree (the Huffman Tree) from the bottom up, starting with the least frequent characters.
- Binary Encoding: In this tree, more frequent characters are placed closer to the root, resulting in shorter binary strings. Conversely, rarer characters receive longer codes.
Key Benefits
- Significant Reduction: Huffman algorithms can typically compress data size by 70–80%, depending on the redundancy of the source material.
- Optimal Prefix Codes: It ensures that no code is a prefix of another, which prevents ambiguity during the decompression (decoding) process.
- Efficiency: By utilizing a greedy approach, the algorithm consistently finds the most mathematically optimal way to represent characters as binary strings for a given frequency set.
Objective
The main objectives of this session are to:
- Implement Huffman’s Coding algorithm.
- Test the implementation on different problem instances
- Construct the optimal binary prefix code
Question 1
Problem Statement
Apply Huffman’s algorithm to construct an optimal binary prefix code for the letters and its frequencies in the following table.
| Letters | A | B | I | M | S | X | Z |
|---|---|---|---|---|---|---|---|
| Frequency | 10 | 7 | 15 | 8 | 10 | 5 | 2 |
Show the complete steps.
Explanation
Using your data: A(10), B(7), I(15), M(8), S(10), X(5), Z(2).
- Initialize: Create a leaf node for each character.
- Merge (Greedy Step): Pick the two smallest:
Z(2)andX(5).- Create a parent node with frequency
2 + 5 = 7.
- Create a parent node with frequency
- Repeat: Now pick the next two smallest from the remaining list. This continues until only one node (the Root) remains.
- Assign Bits: Starting from the Root, assign
0to every left branch and1to every right branch.
Implementation
Python
import heapq
class Node:
def __init__(self, char, freq):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __lt__(self, other):
return self.freq < other.freq
def solve_huffman():
# 1. Initialize data from the problem table
data = {'A': 10, 'B': 7, 'I': 15, 'M': 8, 'S': 10, 'X': 5, 'Z': 2}
heap = [Node(c, f) for c, f in data.items()]
heapq.heapify(heap)
print("--- STEP-BY-STEP MERGING (GREEDY STEPS) ---")
step = 1
while len(heap) > 1:
# Extract two smallest frequencies
n1 = heapq.heappop(heap)
n2 = heapq.heappop(heap)
# Merge them
merged = Node(None, n1.freq + n2.freq)
merged.left = n1
merged.right = n2
print(f"Step {step}: Merged {n1.char or 'Node'}({n1.freq}) and "
f"{n2.char or 'Node'}({n2.freq}) -> New Node({merged.freq})")
heapq.heappush(heap, merged)
step += 1
# 2. Generate Codes and Print Table
root = heap[0]
codes = {}
def generate_codes(node, current_code):
if node:
if node.char:
codes[node.char] = current_code
generate_codes(node.left, current_code + "0")
generate_codes(node.right, current_code + "1")
generate_codes(root, "")
print("\n--- FINAL OPTIMAL BINARY PREFIX CODES ---")
print(f"{'Letter':<10} | {'Frequency':<10} | {'Huffman Code':<15}")
print("-" * 40)
for char in sorted(data.keys()):
print(f"{char:<10} | {data[char]:<10} | {codes[char]:<15}")
solve_huffman()
C Language
#include <stdio.h>
#include <stdlib.h>
struct Node {
char data;
int freq;
struct Node *left, *right;
};
struct Node* createNode(char data, int freq) {
struct Node* n = (struct Node*)malloc(sizeof(struct Node));
n->data = data; n->freq = freq;
n->left = n->right = NULL;
return n;
}
void printCodes(struct Node* root, int arr[], int top) {
if (root->left) { arr[top] = 0; printCodes(root->left, arr, top + 1); }
if (root->right) { arr[top] = 1; printCodes(root->right, arr, top + 1); }
if (!root->left && !root->right) {
printf("%-7c | %-7d | ", root->data, root->freq);
for (int i = 0; i < top; i++) printf("%d", arr[i]);
printf("\n");
}
}
int main() {
char letters[] = {'A', 'B', 'I', 'M', 'S', 'X', 'Z'};
int freqs[] = {10, 7, 15, 8, 10, 5, 2};
struct Node* nodes[7];
for (int i = 0; i < 7; i++) nodes[i] = createNode(letters[i], freqs[i]);
printf("--- STEP-BY-STEP MERGING ---\n");
int size = 7;
while (size > 1) {
int m1 = 0, m2 = 1;
if (nodes[m1]->freq > nodes[m2]->freq) { int t = m1; m1 = m2; m2 = t; }
for (int i = 2; i < size; i++) {
if (nodes[i]->freq < nodes[m1]->freq) { m2 = m1; m1 = i; }
else if (nodes[i]->freq < nodes[m2]->freq) { m2 = i; }
}
struct Node* parent = createNode('$', nodes[m1]->freq + nodes[m2]->freq);
parent->left = nodes[m1]; parent->right = nodes[m2];
printf("Merged %d + %d = %d\n", nodes[m1]->freq, nodes[m2]->freq, parent->freq);
nodes[m1] = parent;
nodes[m2] = nodes[size - 1];
size--;
}
printf("\n--- FINAL TABLE ---\n%-7s | %-7s | %-12s\n", "Letter", "Freq", "Code");
printf("-----------------------------\n");
int path[10];
printCodes(nodes[0], path, 0);
return 0;
}
Rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;
#[derive(Eq, PartialEq)]
struct Node {
freq: i32,
char: Option<char>,
left: Option<Box<Node>>,
right: Option<Box<Node>>,
}
impl Ord for Node {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.freq.cmp(&other.freq)
}
}
impl PartialOrd for Node {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
fn traverse(node: &Node, code: String) {
if let Some(c) = node.char {
println!("{:<7} | {:<7} | {}", c, node.freq, code);
} else {
if let Some(ref l) = node.left {
traverse(l, code.clone() + "0");
}
if let Some(ref r) = node.right {
traverse(r, code + "1");
}
}
}
fn main() {
let mut heap = BinaryHeap::new();
let items = vec![
('A', 10),
('B', 7),
('I', 15),
('M', 8),
('S', 10),
('X', 5),
('Z', 2),
];
for (c, f) in items {
heap.push(Reverse(Node {
freq: f,
char: Some(c),
left: None,
right: None,
}));
}
println!("--- MERGING STEPS ---");
while heap.len() > 1 {
let Reverse(n1) = heap.pop().unwrap();
let Reverse(n2) = heap.pop().unwrap();
let parent = Node {
freq: n1.freq + n2.freq,
char: None,
left: Some(Box::new(n1)),
right: Some(Box::new(n2)),
};
println!(
"Merged: {} + {} = {}",
parent.left.as_ref().unwrap().freq,
parent.right.as_ref().unwrap().freq,
parent.freq
);
heap.push(Reverse(parent));
}
let Reverse(root) = heap.pop().unwrap();
println!(
"\n--- FINAL TABLE ---\n{:<7} | {:<7} | {}",
"Letter", "Freq", "Code"
);
println!("-----------------------------");
traverse(&root, "".to_string());
}
Sample Output
--- STEP-BY-STEP MERGING (GREEDY STEPS) ---
Step 1: Merged Z(2) and X(5) -> New Node(7)
Step 2: Merged B(7) and Node(7) -> New Node(14)
Step 3: Merged M(8) and A(10) -> New Node(18)
Step 4: Merged S(10) and Node(14) -> New Node(24)
Step 5: Merged I(15) and Node(18) -> New Node(33)
Step 6: Merged Node(24) and Node(33) -> New Node(57)
--- FINAL OPTIMAL BINARY PREFIX CODES ---
Letter | Frequency | Huffman Code
----------------------------------------
A | 10 | 111
B | 7 | 010
I | 15 | 10
M | 8 | 110
S | 10 | 00
X | 5 | 0111
Z | 2 | 0110
Question 2
Problem Statement
Implement Huffman's coding algorithm and run it on the problem instance of Q1.
Implementation
Implementation
Question 3
Problem Statement
What is an optimal binary tree and Huffman code for the following set of frequencies? Find out the average number of bits required per character. Show complete steps.
A:15 B:25 C:5 D:7 E:10 F:13 G:9
Initial Frequencies - Sorted
| Char | C | D | G | E | F | A | B |
|---|---|---|---|---|---|---|---|
| Freq | 5 | 7 | 9 | 10 | 13 | 15 | 25 |
Total Frequency = 84
Huffman Tree
| Step | Nodes in Pool (Sorted) | Merged Nodes | New Node |
|---|---|---|---|
| 1 | C:5, D:7, G:9, E:10, F:13, A:15, B:25 | C(5) + D(7) | N1:12 |
| 2 | N1:12, G:9, E:10, F:13, A:15, B:25 → Sort: G:9, E:10, N1:12... | G(9) + E(10) | N2:19 |
| 3 | N1:12, F:13, A:15, N2:19, B:25 | N1(12)+F(13) | N3:25 |
| 4 | A:15, N2:19, N3:25, B:25 | A(15)+N2(19) | N4:34 |
| 5 | N3:25, B:25, N4:34 | N3(25)+B(25) | N5:50 |
| 6 | N4:34, N5:50 | N4(34)+N5(50) | Root:84 |
Assigning Codes
Convention: 0 = Left Child, 1 = Right Child
Root(84)
├─ 0 → N4(34)
│ ├─ 0 → A(15) → 00
│ └─ 1 → N2(19)
│ ├─ 0 → G(9) → 010
│ └─ 1 → E(10) → 011
└─ 1 → N5(50)
├─ 0 → N3(25)
│ ├─ 0 → N1(12)
│ │ ├─ 0 → C(5) → 1000
│ │ └─ 1 → D(7) → 1001
│ └─ 1 → F(13) → 101
└─ 1 → B(25) → 11
Huffman Codes
| Char | Frequency | Code | Length |
|---|---|---|---|
| A | 15 | 00 | 2 |
| B | 25 | 11 | 2 |
| C | 5 | 1000 | 4 |
| D | 7 | 1001 | 4 |
| E | 10 | 011 | 3 |
| F | 13 | 101 | 3 |
| G | 9 | 010 | 3 |
Bits/Character
Total Bits = Σ (Frequency × Code Length)
= (15×2) + (25×2) + (5×4) + (7×4) + (10×3) + (13×3) + (9×3)
= 30 + 50 + 20 + 28 + 30 + 39 + 27 = 224
Average Bits = Total Bits / Total Frequency
= 224 / 84 ≈ 2.67 bits/character
Implementations
Python
import heapq
class Node:
def __init__(self, freq, char=None):
self.freq = freq
self.char = char
self.left = None
self.right = None
# Required for heapq to compare nodes by frequency
def __lt__(self, other):
return self.freq < other.freq
def build_huffman_tree(freq_dict):
"""Builds Huffman Tree using a Min-Heap"""
heap = [Node(f, c) for c, f in freq_dict.items()]
heapq.heapify(heap)
while len(heap) > 1:
left = heapq.heappop(heap)
right = heapq.heappop(heap)
merged = Node(left.freq + right.freq)
merged.left = left
merged.right = right
heapq.heappush(heap, merged)
return heap[0] if heap else None
def generate_codes(root, current_code="", codes=None):
"""DFS traversal to assign 0/1 codes"""
if codes is None:
codes = {}
if root is None:
return
if root.char is not None: # Leaf node
codes[root.char] = current_code
return
generate_codes(root.left, current_code + "0", codes)
generate_codes(root.right, current_code + "1", codes)
return codes
def main():
freq = {'A': 15, 'B': 25, 'C': 5, 'D': 7, 'E': 10, 'F': 13, 'G': 9}
root = build_huffman_tree(freq)
codes = generate_codes(root)
# Calculate average bits
total_bits = sum(freq[c] * len(codes[c]) for c in codes)
total_freq = sum(freq.values())
avg_bits = total_bits / total_freq
print("Huffman Codes:")
for char in sorted(codes.keys()):
print(f"{char}: {codes[char]}")
print(f"\nAverage bits per character: {avg_bits:.2f}")
if __name__ == "__main__":
main()
C Language
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct Node {
int freq;
char data;
struct Node *left, *right;
} Node;
// Simulated Priority Queue
Node* pq[20];
int pq_size = 0;
int active_nodes = 0;
Node* newNode(int freq, char data) {
Node* n = (Node*)malloc(sizeof(Node));
n->freq = freq;
n->data = data;
n->left = n->right = NULL;
pq[pq_size++] = n;
active_nodes++;
return n;
}
// Extract minimum frequency node
Node* extractMin() {
int minIdx = -1;
for (int i = 0; i < pq_size; i++) {
if (pq[i] != NULL) {
if (minIdx == -1 || pq[i]->freq < pq[minIdx]->freq)
minIdx = i;
}
}
if (minIdx == -1) return NULL;
Node* minNode = pq[minIdx];
pq[minIdx] = NULL;
active_nodes--;
return minNode;
}
void printCodes(Node* root, char arr[], int top) {
if (root->left) {
arr[top] = '0';
printCodes(root->left, arr, top + 1);
}
if (root->right) {
arr[top] = '1';
printCodes(root->right, arr, top + 1);
}
if (root->data != '