I have altered some Huffman C++ code that I found, specifically the decoding part. It now runs orders of magnitude faster, which is especially noticeable for large inputs. Are there any other optimizations to be made?
https://github.com/sjhalayka/qc_for_cs/blob/main/pd_10_3_2.cpp
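For context, get_decoded_vector is a member of a templated Huffman coder class. Here is a rough sketch of the declarations it relies on; the member huffman_codes and the helper is_valid_window exist in the linked file, but the class name huffman_coder and the exact signatures below are simplified stand-ins, not copies of the real thing:

#include <map>
#include <string>
#include <vector>
using namespace std;

template<class T>
class huffman_coder
{
protected:
	// Assumption: one code string of '0'/'1' characters per symbol
	map<T, string> huffman_codes;

	// Assumption: true while a window of length len starting at
	// begin_index still fits inside the encoded string
	static bool is_valid_window(const size_t encoded_len, const size_t begin_index, const size_t len)
	{
		return begin_index + len <= encoded_len;
	}

public:
	// get_decoded_vector, shown below, lives here inside the class
};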
bool get_decoded_vector(const string& encoded_string, vector<T>& decoded_vector)
{
	decoded_vector.clear();

	if (huffman_codes.empty() || encoded_string.empty())
		return false;

	if (huffman_codes.size() == 1)
	{
		const T c = huffman_codes.begin()->first; // The single symbol's code should be "0"

		for (size_t i = 0; i < encoded_string.size(); i++)
			decoded_vector.push_back(c);

		return true;
	}

	// Get the minimum and maximum token size
	size_t min_bits = static_cast<size_t>(-1); // Casting -1 to size_t yields the largest value a size_t can hold
	size_t max_bits = 0;

	for (const auto& pair : huffman_codes)
	{
		if (pair.second.size() < min_bits)
			min_bits = pair.second.size();

		if (pair.second.size() > max_bits)
			max_bits = pair.second.size();
	}

	const size_t encoded_len = encoded_string.length();

	// Use a sliding window of variable length
	size_t begin_index = 0; // Start at the very beginning of the string
	size_t len = min_bits; // Don't waste time by starting at len = 1 if it's not necessary

	// While there is stuff left to parse
	while (is_valid_window(encoded_len, begin_index, len))
	{
		if (len > max_bits)
		{
			// No match was found at any valid length, so the input is malformed
			return false;
		}

		// Match token with map element
		const string token = encoded_string.substr(begin_index, len);

		bool found_token = false;

		for (const auto& pair : huffman_codes)
		{
			if (pair.second == token)
			{
				decoded_vector.push_back(pair.first);
				found_token = true;
				break;
			}
		}

		if (found_token)
		{
			// Slide the window forward by the token size,
			// then reset the window length to the minimum
			begin_index += token.size();
			len = min_bits;
		}
		else
		{
			// Expand the window length by 1
			len++;
		}
	}
	// Fail if trailing bits were left undecoded
	return begin_index == encoded_len;
}
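One direction I have been considering, but have not yet tested against the linked file, is to replace the linear scan over huffman_codes with a reverse map from code string to symbol, built once per code table, so that each window lookup is an average O(1) hash lookup instead of an O(n) scan. A rough sketch; the reverse_codes member and build_reverse_codes are my own additions, not part of the linked file:

#include <unordered_map>

// Sketch: built once, whenever huffman_codes changes
unordered_map<string, T> reverse_codes;

void build_reverse_codes(void)
{
	reverse_codes.clear();

	for (const auto& pair : huffman_codes)
		reverse_codes[pair.second] = pair.first;
}

The inner matching loop in get_decoded_vector would then shrink to:

const string token = encoded_string.substr(begin_index, len);
const auto it = reverse_codes.find(token);

if (it != reverse_codes.end())
{
	decoded_vector.push_back(it->second);
	begin_index += len;
	len = min_bits;
}
else
{
	len++;
}

In C++20 the per-iteration substr allocation could also be avoided by giving the map a transparent hash and looking up a string_view, but even without that, the reverse map should turn the per-token cost from O(n) in the number of symbols to roughly O(1).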