Initial Commit

This commit is contained in:
eneller
2026-02-12 01:17:45 +01:00
commit 08e7ab4681
9 changed files with 1424 additions and 0 deletions

336
.gitignore vendored Normal file
View File

@@ -0,0 +1,336 @@
# Created by https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=latex,visualstudiocode
### LaTeX ###
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
*.pdf
## Generated if empty string is given at "Please type another file name for output:"
.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
# fixme
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
# easy-todo
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib
### LaTeX Patch ###
# LIPIcs / OASIcs
*.vtc
# glossaries
*.glstex
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode
bib
*.*-SAVE-ERROR
*.ist

5
.latexmkrc Normal file
View File

@@ -0,0 +1,5 @@
$latex = 'latex %O --shell-escape %S';
$pdflatex = 'pdflatex %O --shell-escape %S';
$pdf_mode = 1;
$clean_ext = "lol nav snm loa bbl* glo ist";
$bibtex_use = 2;

94
compression.bib Normal file
View File

@@ -0,0 +1,94 @@
@article{shannon1948mathematical,
title={A mathematical theory of communication},
author={Shannon, Claude E},
journal={The Bell system technical journal},
volume={27},
number={3},
pages={379--423},
year={1948},
publisher={Nokia Bell Labs}
}
@misc{ enwiki:shannon-source-coding,
author = "{Wikipedia contributors}",
title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
note = "[Online; accessed 25-November-2025]"
}
@misc{ enwiki:shannon-fano,
author = "{Wikipedia contributors}",
title = "Shannon--Fano coding --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Fano_coding&oldid=1315776380",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:huffman-code,
author = "{Wikipedia contributors}",
title = "Huffman coding --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Huffman_coding&oldid=1321991625",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:lzw,
author = "{Wikipedia contributors}",
title = "Lempel--Ziv--Welch --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Lempel%E2%80%93Ziv%E2%80%93Welch&oldid=1307959679",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:arithmetic-code,
author = "{Wikipedia contributors}",
title = "Arithmetic coding --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Arithmetic_coding&oldid=1320999535",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:kraft-mcmillan,
author = "{Wikipedia contributors}",
title = "Kraft--McMillan inequality --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Kraft%E2%80%93McMillan_inequality&oldid=1313803157",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:partition,
author = "{Wikipedia contributors}",
title = "Partition problem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Partition_problem&oldid=1320732818",
note = "[Online; accessed 30-November-2025]"
}
@misc{ dewiki:shannon-fano,
author = "Wikipedia",
title = "Shannon-Fano-Kodierung --- Wikipedia{,} die freie Enzyklopädie",
year = "2024",
url = "https://de.wikipedia.org/w/index.php?title=Shannon-Fano-Kodierung&oldid=246624798",
note = "[Online; Stand 26. November 2025]"
}
@misc{ dewiki:huffman-code,
author = "Wikipedia",
title = "Huffman-Kodierung --- Wikipedia{,} die freie Enzyklopädie",
year = "2025",
url = "https://de.wikipedia.org/w/index.php?title=Huffman-Kodierung&oldid=254369306",
note = "[Online; Stand 26. November 2025]"
}
@misc{ dewiki:lzw,
author = "Wikipedia",
title = "Lempel-Ziv-Welch-Algorithmus --- Wikipedia{,} die freie Enzyklopädie",
year = "2025",
url = "https://de.wikipedia.org/w/index.php?title=Lempel-Ziv-Welch-Algorithmus&oldid=251943809",
note = "[Online; Stand 26. November 2025]"
}
@misc{ dewiki:kraft-mcmillan,
author = "Wikipedia",
title = "Kraft-Ungleichung --- Wikipedia{,} die freie Enzyklopädie",
year = "2018",
url = "https://de.wikipedia.org/w/index.php?title=Kraft-Ungleichung&oldid=172862410",
note = "[Online; Stand 26. November 2025]"
}
@misc{ dewiki:partition,
author = "Wikipedia",
title = "Partitionsproblem --- Wikipedia{,} die freie Enzyklopädie",
year = "2025",
url = "https://de.wikipedia.org/w/index.php?title=Partitionsproblem&oldid=255787013",
note = "[Online; Stand 26. November 2025]"
}

332
compression.tex Normal file
View File

@@ -0,0 +1,332 @@
\documentclass{article}
%%% basic layouting
\usepackage[utf8]{inputenc}
\usepackage[margin=1in]{geometry} % Adjust margins
\usepackage{caption}
\PassOptionsToPackage{hyphens}{url} % allow breaking urls; must precede hyperref, which loads url
\usepackage{hyperref}
\usepackage{float}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage{parskip} % dont indent after paragraphs, figures
\usepackage{xcolor}
%%% algorithms
\usepackage{algorithm}
\usepackage{algpseudocodex}
% graphs and plots
\usepackage{tikz}
\usepackage{pgfplots}
\usetikzlibrary{positioning}
\usetikzlibrary{trees}
%\usetikzlibrary{graphs, graphdrawing}
%%% math
\usepackage{amsmath}
%%% citations
\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
%\usepackage{csquotes} % Recommended for biblatex
\addbibresource{compression.bib}
\title{Compression}
\date{\today}
\begin{document}
\maketitle
\section{Introduction}
As the volume of data grows exponentially around the world, compression is only gaining in importance to all disciplines.
Not only does it enable the storage of large amounts of information needed for research in scientific domains
like DNA sequencing and analysis, it also plays a vital role in keeping stored data accessible by
facilitating cataloging, search and retrieval.
The concept of entropy introduced in the previous entry is closely related to the design of efficient codes for compression.
In coding theory, the events of an information source are to be encoded in a manner that minimizes the bits needed to store
the information provided by the source.
The process of encoding can thus be described by a function $C$ transforming from a source alphabet $X$ to a code alphabet $Y$.
Symbols in the alphabets are denoted $x_i$ and $y_j$ respectively, and have underlying probabilities $p_{i}$.
% TODO fix use of alphabet / symbol / code word: alphabet is usually binary -> code word is 010101
\begin{equation}
C: X \rightarrow Y \qquad X=\{x_1,x_2,\dots,x_n\} \qquad Y=\{y_1,y_2,\dots,y_m\}
\label{eq:formal-code}
\end{equation}
The understanding of entropy as the expected information $E(I)$ of a message provides an intuition that,
given a source with a given entropy (in bits), no coding can have a lower average word length $E(L)$ (in bits)
than this entropy without losing information; $l_i$ denotes the length of the code word assigned to symbol $x_i$.
\begin{equation}
H = E(I) = - \sum_i p_i \log_2(p_i) \quad \leq \quad E(L) = \sum_i p_i l_i
\label{eq:entropy-information}
\end{equation}
This is the content of Shannon's source coding theorem,
introduced in \citeyear{shannon1948mathematical}.
In his paper, \citeauthor{shannon1948mathematical} proposed two principal ideas to minimize the average length of a code.
The first is to use short codes for symbols with higher probability.
This is an intuitive approach as more frequent symbols have a higher impact on average code length.
The second idea is to encode events that frequently occur together at the same time, artificially increasing
the size of the code alphabet $Y$ to allow for greater flexibility in code design~\cite{enwiki:shannon-source-coding}.
Codes can have several properties. A code where all codewords have equal lengths is called a \textit{block code}.
While easy to construct, they are not well suited for our goal of minimizing average word length
as specified in \autoref{eq:entropy-information} because the source alphabet is generally not equally distributed
in a way that $p_i = \frac{1}{n}$.
In order to send (or store, for that matter) multiple code words in succession, a code $Y$ has to be uniquely decodable.
When receiving 0010 in succession using the nonsingular code $Y_2$ from \autoref{tab:code-properties},
it is not clear to the recipient which source symbols make up the intended message.
For the specified sequence, there are a total of three possibilities to decode the received code:
$s_0 s_3 s_0$, $s_0 s_0 s_1$ or $s_2 s_1$ could all be the intended message, making the code useless.
\begin{table}[H]
\centering
\begin{tabular}{c l l l}
Source Code $X$ & Prefix Code $Y_0$ & Suffix Code $Y_1$ & Nonsingular Code $Y_2$ \\
\hline
$s_0$ & 0 & 0 & 0 \\
$s_1$ & 10 & 01 & 10 \\
$s_2$ & 110 & 011 & 00 \\
$s_3$ & 1110 & 0111 & 01 \\
\end{tabular}
\caption{Examples of different properties of codes}
\label{tab:code-properties}
\end{table}
Another interesting property of a code that is specifically important for transmission but less so for storage, is
being prefix-free.
A prefix code (which is said to be prefix-free) can be decoded by the receiver of the symbol as soon as it is received
because no code word $y_j$ is the prefix of another valid code word.
As shown in \autoref{tab:code-properties} $Y_0$ is a prefix code, in this case more specifically called a \textit{comma code}
because each code word is separated by a trailing 0 from the next code word.
$Y_1$ in contrast is called a \textit{capital code} (capitalizes the beginning of each word) and is not a prefix code.
In the case of the capital code in fact every word other than the longest possible code word is a prefix of the longer words
lower in the table. As a result, the receiver cannot instantaneously decode each word but rather has to wait for the leading 0
of the next codeword.
Further, a code is said to be \textit{efficient} if it has the smallest possible average word length, i.e. matches
the entropy of the source alphabet.
\section{Kraft-McMillan inequality}
The Kraft-McMillan inequality gives a necessary and sufficient condition for the existence of a prefix code.
In the form shown in \autoref{eq:kraft-mcmillan} it is intuitive to understand given a code tree.
Because prefix codes require code words to only be situated on the leaves of a code tree,
for every code word $i$ using an alphabet of size $r$, it uses up exactly $r^{-l_i}$ of the available code words.
The sum over all of them can thus never be larger than one else
the code will not be uniquely decodable \cite{enwiki:kraft-mcmillan}.
\begin{equation}
\sum_i r^{-l_i} \leq 1
\label{eq:kraft-mcmillan}
\end{equation}
\section{Shannon-Fano}
Shannon-Fano coding is one of the earliest methods for constructing prefix codes.
It is a top-down method that divides the symbols into two groups of approximately equal total probability,
recursively partitioning them to assign shorter codewords to more frequent events.
\begin{algorithm}
\caption{Shannon-Fano compression}
\label{alg:shannon-fano}
\begin{algorithmic}
\Procedure{ShannonFano}{symbols, probabilities}
\If{length(symbols) $= 1$}
\State \Return codeword for single symbol
\EndIf
\State $\text{current\_sum} \gets 0$
\State $\text{split\_index} \gets 0$
\For{$i \gets 1$ \textbf{to} length(symbols)}
\If{$|\text{current\_sum} + \text{probabilities}[i] - 0.5| < |\text{current\_sum} - 0.5|$}
\State $\text{current\_sum} \gets \text{current\_sum} + \text{probabilities}[i]$
\State $\text{split\_index} \gets i$
\EndIf
\EndFor
\State $\text{left\_group} \gets \text{symbols}[1 : \text{split\_index}]$
\State $\text{right\_group} \gets \text{symbols}[\text{split\_index} + 1 : \text{length(symbols)}]$
\State Assign prefix ``0'' to codes from ShannonFano($\text{left\_group}, \ldots$)
\State Assign prefix ``1'' to codes from ShannonFano($\text{right\_group}, \ldots$)
\EndProcedure
\end{algorithmic}
\end{algorithm}
While Shannon-Fano coding guarantees the generation of a prefix-free code with an average word length close to the entropy,
it is not guaranteed to be optimal. In practice, it often generates codewords that are only slightly longer than necessary.
Weaknesses of the algorithm also include the non-trivial partitioning phase \cite{enwiki:partition}, which can in practice however be solved relatively efficiently.
Due to the aforementioned limitations, the two historically slightly ambiguous Shannon-Fano algorithms are almost never used,
in favor of Huffman coding as described in the next section.
\section{Huffman Coding}
\label{sec:huffman}
Huffman coding is an optimal prefix coding algorithm that minimizes the expected codeword length
for a given set of symbol probabilities. Developed by David Huffman in 1952, it guarantees optimality
by constructing a binary tree where the most frequent symbols are assigned the shortest codewords.
Among symbol-by-symbol codes, Huffman coding comes closest to the theoretical limit given by the entropy of discrete memoryless sources,
making it one of the most important compression techniques in information theory.
Unlike Shannon-Fano, which uses a top-down approach, Huffman coding employs a bottom-up strategy.
The algorithm builds the code tree by iteratively combining the two symbols with the lowest probabilities
into a new internal node. This greedy approach ensures that the resulting tree minimizes the weighted path length,
where the weight of each symbol is its probability.
\begin{algorithm}
\caption{Huffman coding algorithm}
\label{alg:huffman}
\begin{algorithmic}
\Procedure{Huffman}{symbols, probabilities}
\State Create a leaf node for each symbol and add it to a priority queue
\While{priority queue contains more than one node}
\State Extract two nodes with minimum frequency: $\text{left}$ and $\text{right}$
\State Create a new internal node with frequency $\text{freq(left)} + \text{freq(right)}$
\State Set $\text{left}$ as the left child and $\text{right}$ as the right child
\State Add the new internal node to the priority queue
\EndWhile
\State $\text{root} \gets$ remaining node in priority queue
\State Traverse tree and assign codewords: ``0'' for left edges, ``1'' for right edges
\State \Return codewords
\EndProcedure
\end{algorithmic}
\end{algorithm}
The optimality of Huffman coding can be proven by exchange arguments.
The key insight is that if two codewords have the maximum length in an optimal code, they must correspond to the two least frequent symbols.
Moreover, these two symbols can be combined into a single meta-symbol without affecting optimality,
which leads to a recursive structure that guarantees Huffman's method produces an optimal code.
The average codeword length $L_{\text{Huffman}}$ produced by Huffman coding satisfies the following bounds:
\begin{equation}
H(X) \leq L_{\text{Huffman}} < H(X) + 1
\label{eq:huffman-bounds}
\end{equation}
where $H(X)$ is the entropy of the source. This means Huffman coding is guaranteed to be within one bit
of the theoretical optimum. In practice, when symbol probabilities are powers of $\frac{1}{2}$,
Huffman coding achieves perfect compression and $L_{\text{Huffman}} = H(X)$.
The computational complexity of Huffman coding is $O(n \log n)$, where $n$ is the number of distinct symbols.
A priority queue implementation using a binary heap achieves this bound, making Huffman coding
efficient even for large alphabets. Its widespread use in compression formats such as DEFLATE, JPEG, and MP3
testifies to its practical importance.
However, Huffman coding has limitations. First, it requires knowledge of the probability distribution
of symbols before encoding, necessitating a preprocessing pass or transmission of frequency tables.
Second, it assigns an integer number of bits to each symbol, which can be suboptimal
when symbol probabilities do not align well with powers of two.
Symbol-by-symbol coding imposes a constraint that is often unneeded since codes will usually be packed in long sequences,
leaving room for further optimization as provided by Arithmetic Coding.
\section{Arithmetic Coding}
Arithmetic coding is a modern compression technique that encodes an entire message as a single interval
within the range $[0, 1)$, as opposed to symbol-by-symbol coding used by Huffman.
By iteratively refining this interval based on the probabilities of the symbols in the message,
arithmetic coding can achieve compression rates that approach the entropy of the source.
Its ability to handle non-integer bit lengths makes it particularly powerful
for applications requiring high compression efficiency.
In the basic form, a message is first written in the base of the alphabet with a leading `$0.$': $ \text{ABBCAB} = 0.011201_3$,
in this case yielding a ternary number as the alphabet size is $ |\{A,B,C\}| = 3 $.
This number can then be encoded to the target base (usually 2) with sufficient precision to yield back the original number, resulting in $0.0010110010_2$.
The decoder only gets the rational number $q$ and the length $n$ of the original message.
The encoding can then be easily reversed by changing base and rounding to $n$ digits.
In general, arithmetic coding can produce near-optimal output for any given source probability distribution.
This is achieved by adjusting the intervals that are interpreted as a given source symbol.
Given the following source probabilities of $p_A = \frac{6}{8}, p_B = p_C = \frac{1}{8}$ the intervals would be adjusted to
$ A= [0,\frac{6}{8}), B=[\frac{6}{8}, \frac{7}{8}), C=[\frac{7}{8},1)$.
Instead of transforming the base of the number and rounding to appropriate precision, the encoder recursively refines the interval and in the end chooses a number inside that interval.
\begin{enumerate}
\item \textbf{Symbol:A} $A=[0, \frac{6}{8})$
\item $ A= [0,(\frac{6}{8})^2), B=[(\frac{6}{8})^2, \frac{7}{8} \cdot \frac{6}{8}), C=[\frac{7}{8} \cdot \frac{6}{8},1 \cdot \frac{6}{8})$.
\item \textbf{Symbol:B} $B=[(\frac{6}{8})^2, \frac{7}{8} \cdot \frac{6}{8}) = [\frac{36}{64}, \frac{42}{64})$
\end{enumerate}
Depending on implementation, the source message can also be encoded in base $n+1$, reserving room for a special \verb|END-OF-DATA| symbol that the decoder
will look for and consequently stop reading from the input $q$.
\section{LZW Algorithm}
The Lempel-Ziv-Welch (LZW) algorithm is a dictionary-based compression method that dynamically builds a dictionary
of recurring patterns in the data as compression proceeds. Unlike entropy-based methods such as Huffman or arithmetic coding,
LZW does not require prior knowledge of symbol probabilities, making it highly adaptable and efficient
for a wide range of applications, including image and text compression.
The algorithm was developed by Abraham Lempel and Jacob Ziv, with refinements by Terry Welch in 1984.
The fundamental insight of LZW is that many data sources contain repeating patterns that can be exploited
by replacing longer sequences with shorter codes. Rather than assigning variable-length codes to individual symbols
based on their frequency, LZW identifies recurring substrings and assigns them fixed-length codes.
As the algorithm processes the data, it dynamically constructs a dictionary that maps these patterns to codes,
without requiring the dictionary to be transmitted with the compressed data.
\begin{algorithm}
\caption{LZW compression algorithm}
\label{alg:lzw}
\begin{algorithmic}
\Procedure{LZWCompress}{data}
\State Initialize dictionary with all single characters
\State $\text{code} \gets$ next available code (typically 256 for byte alphabet)
\State $w \gets$ first symbol from data
\State $\text{output} \gets [\,]$
\For{each symbol $c$ in remaining data}
\If{$w + c$ exists in dictionary}
\State $w \gets w + c$
\Else
\State append $\text{code}(w)$ to output
\If{code $<$ max\_code}
\State Add $w + c$ to dictionary with code $\text{code}$
\State $\text{code} \gets \text{code} + 1$
\EndIf
\State $w \gets c$
\EndIf
\EndFor
\State append $\text{code}(w)$ to output
\State \Return output
\EndProcedure
\end{algorithmic}
\end{algorithm}
The decompression process is equally elegant. The decompressor initializes an identical dictionary
and reconstructs the original data by decoding the transmitted codes. Crucially, the decompressor
can reconstruct the dictionary entries on-the-fly as it processes the compressed data,
recovering the exact sequence of dictionary updates that occurred during compression.
This property is what allows the dictionary to remain implicit rather than explicitly transmitted.
\begin{algorithm}
\caption{LZW decompression algorithm}
\label{alg:lzw-decompress}
\begin{algorithmic}
\Procedure{LZWDecompress}{codes}
\State Initialize dictionary with all single characters
\State $\text{code} \gets$ next available code
\State $w \gets \text{decode}(\text{codes}[0])$
\State $\text{output} \gets w$
\For{each code $c$ in $\text{codes}[1:]$}
\If{$c$ exists in dictionary}
\State $k \gets \text{decode}(c)$
\Else
\State $k \gets w + w[0]$ \quad \{handle special case\}
\EndIf
\State append $k$ to output
\State Add $w + k[0]$ to dictionary with code $\text{code}$
\State $\text{code} \gets \text{code} + 1$
\State $w \gets k$
\EndFor
\State \Return output
\EndProcedure
\end{algorithmic}
\end{algorithm}
LZW's advantages make it particularly valuable for certain applications. First, it requires no statistical modeling
of the input data, making it applicable to diverse data types without prior analysis.
Second, the dictionary is built incrementally and implicitly, eliminating transmission overhead.
Third, it can achieve significant compression on data with repeating patterns, such as text, images, and structured data.
Fourth, the algorithm is relatively simple to implement and computationally efficient, with time complexity $O(n)$
where $n$ is the length of the input.
However, LZW has notable limitations. Its compression effectiveness is highly dependent on the structure and repetitiveness
of the input data. On truly random data with no repeating patterns, LZW can even increase the file size.
Additionally, the fixed size of the dictionary (typically 12 or 16 bits, allowing $2^{12}=4096$ or $2^{16}=65536$ entries)
limits its ability to adapt to arbitrarily large vocabularies of patterns.
When the dictionary becomes full, most implementations stop adding new entries, potentially reducing compression efficiency.
LZW has seen widespread practical deployment in compression standards and applications.
The GIF image format uses LZW compression, as does the TIFF image format in some variants.
The relationship between dictionary-based methods like LZW and entropy-based methods like Huffman
is complementary rather than competitive. LZW excels at capturing structure and repetition,
while entropy-based methods optimize symbol encoding based on probability distributions.
This has led to hybrid approaches that combine both techniques, such as the Deflate algorithm,
which uses LZSS (a variant of LZ77) followed by Huffman coding of the output to achieve better compression ratios.
\printbibliography
\end{document}

31
correction.tex Normal file
View File

@@ -0,0 +1,31 @@
\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage[margin=1in]{geometry} % Adjust margins
\usepackage{caption}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage{parskip} % dont indent after paragraphs, figures
\usepackage{xcolor}
%\usepackage{csquotes} % Recommended for biblatex
\usepackage{tikz}
\usepackage{pgfplots}
\usetikzlibrary{positioning}
\usepackage{float}
\usepackage{amsmath}
\PassOptionsToPackage{hyphens}{url}
\usepackage{hyperref} % allows urls to follow line breaks of text
\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
\addbibresource{entropy.bib}
\title{Error Correction Codes}
\date{\today}
\begin{document}
\maketitle
(theorems: Hamming condition, Varshamov-Gilbert)
CRC
\printbibliography
\end{document}

47
crypto.bib Normal file
View File

@@ -0,0 +1,47 @@
@book{luenberger,
title={Information science},
author={Luenberger, David G},
year={2012},
publisher={Princeton University Press}
}
@misc{ enwiki:maryofscots,
author = "{Wikipedia contributors}",
title = "Mary, Queen of Scots --- {Wikipedia}{,} The Free Encyclopedia",
year = "2026",
url = "https://en.wikipedia.org/w/index.php?title=Mary,_Queen_of_Scots&oldid=1333198012",
note = "[Online; accessed 22-January-2026]"
}
@misc{ enwiki:kerckhoff,
author = "{Wikipedia contributors}",
title = "Kerckhoffs's principle --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Kerckhoffs%27s_principle&oldid=1320402404",
note = "[Online; accessed 2-February-2026]"
}
@misc{ enwiki:confusion-diffusion,
author = "{Wikipedia contributors}",
title = "Confusion and diffusion --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Confusion_and_diffusion&oldid=1307746165",
note = "[Online; accessed 3-February-2026]"
}
@ARTICLE{diffiehellman,
author={Diffie, W. and Hellman, M.},
journal={IEEE Transactions on Information Theory},
title={New directions in cryptography},
year={1976},
volume={22},
number={6},
pages={644-654},
keywords={Cryptography;Receivers;Authentication;Eavesdropping;Costs;Business;Public key cryptography},
doi={10.1109/TIT.1976.1055638}}
@article{rsa,
title={A method for obtaining digital signatures and public-key cryptosystems},
author={Rivest, Ronald L and Shamir, Adi and Adleman, Leonard},
journal={Communications of the ACM},
volume={21},
number={2},
pages={120--126},
year={1978},
publisher={ACM New York, NY, USA}
}

176
crypto.tex Normal file
View File

@@ -0,0 +1,176 @@
\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage[margin=1in]{geometry} % Adjust margins
\usepackage{caption}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage{parskip} % dont indent after paragraphs, figures
\usepackage{xcolor}
%\usepackage{csquotes} % Recommended for biblatex
\usepackage{tikz}
\usepackage{pgfplots}
\usetikzlibrary{positioning}
\usepackage{float}
\usepackage{amsmath}
\PassOptionsToPackage{hyphens}{url}
\usepackage{hyperref} % allows urls to follow line breaks of text
\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
\addbibresource{crypto.bib}
\usepackage{glossaries}
\makeglossaries
\newacronym{DES}{DES}{Data Encryption Standard}
\newacronym{AES}{AES}{Advanced Encryption Standard}
\newacronym{RSA}{RSA}{Rivest--Shamir--Adleman}
\title{Cryptography}
\date{\today}
\begin{document}
\maketitle
\section{Introduction}
Cryptography is ubiquitous in our modern world.
While the origins of cryptography date back thousands of years, evidence of its use in ancient times is sparse.
\cite{luenberger}
Most of its use seemed to be reserved for political and military leaders, e.g. notably Mary Queen of Scots,
who while in prison, plotted to kill Queen Elizabeth using encrypted letters \cite{enwiki:maryofscots}.
With the widespread adoption of the internet, the need for several cryptographical functions arose.
Due to its intended original use as a trusted research network (ARPANET),
almost none of the original protocols were 'secure' in any sense of the word.
Most notably still today is SMTP, the \textit{Simple Mail Transfer Protocol}, used to send email to servers.
In its original implementation, it allowed attackers to intercept emails in transit to read and modify them
and even spoof the sender address to impersonate others.
SMTP today is secured using a combination of mitigations for these attacks, such as STARTTLS, SPF, DKIM and DMARC,
emphasizing the need for securely designed protocols.
\subsection{Security}
Common goals associated with security include the \textit{CIA triad}, consisting of
\begin{itemize}
\item Confidentiality: Prevent unauthorized reading
\item Integrity: Prevent unauthorized modification
\item Availability: Prevent denial of service
\end{itemize}
Further goals include authenticity and non-repudiation. Cryptography can help with all of the aforementioned goals
except availability.
This can be achieved using several different applications of cryptography:
\begin{itemize}
\item Encryption provides confidentiality by only saving / transmitting an encrypted message.
\item Hash functions ensure data has not been altered.
\item Digital signatures confirm a message was indeed sent by who we expect it to be, preventing man-in-the-middle attacks
where the message is simply swapped out before reaching its destination, as well as providing proof a message was sent (Non-repudiation).
\item Certificates confirm the sender's identity.
\end{itemize}
Importantly, Kerckhoffs's principle \cite{enwiki:kerckhoff} is what allows us to go into detail on the following algorithms.
Embraced by researchers today, it holds that the security of a cryptosystem should only rely on the secrecy of the key,
allowing and encouraging the publication of cryptographic algorithms. \newline
It is closely related to Shannon's maxim, stating that
``one ought to design systems under the assumption that the enemy will immediately gain full familiarity with them''.
This is opposed to \textit{security through obscurity}, which does not allow for verification of the cryptographic
algorithm through a scientific process in the public domain.
\subsection{Hash Functions}
A general hash function $h(m)$ is a function that takes a message $m$ of arbitrary length and produces an output $h$ called \textit{hash}
of fixed length. However, not every mathematical function can be considered a hash function.
The main applications of hash functions include integrity checking and hash maps for efficient data retrieval.
Depending on the applications, different properties determine the usefulness of a function.
An obvious desired property is efficiency - every application benefits from faster computing times.
Also central to all applications of hash functions is a property called \textit{collision resistance}, where there should be no
efficient way, i.e. no better way than brute force to find $m_1 \neq m_2$ so that $h(m_1) = h(m_2)$.
Again, for encryption the importance is clear. If a password is stored in hashed form to obfuscate the clear text,
no security is gained if it is easy for an attacker to find a password that produces the same hash and thus passes the challenge.
A similar notion holds true for data retrieval. If it is too easy to find collisions, e.g. similar inputs produce similar outputs,
there will be an uneven distribution in the target domain and thus little to no efficiency gain.
Another desired property, specifically for encryption is what is usually used synonymously with a hash function: a \textit{one-way function}.
Given $h(m)$, there should be no method more efficient than brute force to find a matching $m$. \newline
As alluded to earlier, hash functions are readily used for integrity checking.
By generating a fixed-size hash value for a given input, they allow users to verify that data has not been altered,
whether intentionally or accidentally.
For example, when downloading a file, comparing its hash with a published checksum ensures the file's integrity.
They are also often used in combination with public key cryptography, allowing the sender to sign with his private key
to prove not only integrity but authenticity.
\subsection{Encryption}
Even though the properties of hash functions are similar to encryption, the fact that the input message is reduced to a fixed size hash
also means that inevitably information is lost by every hash function.
Fundamentally, encryption has the goal of only allowing authorized parties to read a message.
This is achieved by encoding the \textit{plaintext} into a \textit{ciphertext} and then transmitting/storing that ciphertext
separately from the necessary key to decrypt it.
Early encryptions intuitively demonstrate two concepts that can be employed to encode a message:
\textit{substitution} and \textit{transposition}.
\paragraph{Substitution} is used by
the simple Caesar cipher, often achieved by rotating two disks against each other, each with the alphabet written out on them.
\autoref{tab:caesar} shows a simple Caesar cipher where the cipher alphabet is simply shifted by $+3$ positions from the plaintext alphabet.
In the process of encoding, A is therefore replaced (substituted) with D, B with E, and so on.
Upon reception of the message, the same process is done in reverse, i.e. shifted by $-3$.
\begin{table}[h]
\resizebox{\textwidth}{!}{%
\begin{tabular}{c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c}
A&B&C&D&E&F&G&H&I&J&K&L&M&N&O&P&Q&R&S&T&U&V&W&X&Y&Z \\
\hline
D&E&F&G&H&I&J&K&L&M&N&O&P&Q&R&S&T&U&V&W&X&Y&Z&A&B&C
\end{tabular}%
}
\caption{A simple substitution cipher demonstrated by a 3-letter shift.}
\label{tab:caesar}
\end{table}
This simple encryption is easy to break however for several reasons.
Caesar ciphers in general only offer 26 different keys as further shifts only wrap around to $29 \bmod 26 = 3$, with a shift of 26
being equal to the cleartext. \newline
Further, by shifting every letter by the same amount,
the properties of the source language such as word spacing and letter frequencies are retained in the ciphertext,
leaving it vulnerable to simple attacks.
\paragraph{Transposition} is the process of reordering the plaintext to obtain a ciphertext.
Here, the key can be understood as instructions on how to re-order the ciphertext to obtain the original message.
The \textit{scytale} is one of the earliest implementations of a transposition cipher.
\paragraph{Confusion and Diffusion} \cite{enwiki:confusion-diffusion}
\section{DES}\label{sec:des}
The \acrfull{DES} is a symmetric (or private-key) cipher developed in the 1970s at IBM as an archetypal block cipher.
It takes in a block of 64 bits and transforms it to a ciphertext using a key of equal length.
Despite suspicions of backdoors engineered into the algorithm due to the involvement of the NSA in the development of \acrshort{DES},
it was approved as a federal standard in the USA in 1976 and only retired due to its short key length,
for which the NSA however was directly responsible as well. \newline
Nevertheless, it sparked public and scientific interest in the research of encryption algorithms, producing a large body of publications.
\section{AES}
The \acrfull{AES} superseded \acrshort{DES} in 2001 after an official selection process.
Unlike its predecessor, it does not use a Feistel network.
\section{RSA}
\acrfull{RSA} is the first asymmetric (or public-key) cryptographic algorithm and can thus be used for encryption and digital signing.
It was named after its eponymous inventors in \citeyear{rsa} after trying to disprove the existence of \textit{trapdoor functions},
a concept introduced by \citeauthor{diffiehellman} in their appropriately named pivotal paper \citetitle{diffiehellman}.
The algorithm they came up with relies on modular arithmetic, which remains the most popular class of asymmetric cryptography.
\begin{enumerate}
\item Choose randomly and stochastically independent primes $p,q$ of similar size so that
$0.1 < | \log_2 p - \log_2 q | < 30 $.
\item Calculate $ N= p \cdot q $
\item Compute Euler's totient function $ \varphi (N) = (p-1) \cdot (q-1)$, which is kept secret.
\item Choose an integer $e$ so that $ 1 < e < \varphi (N) $ and $\gcd(e, \varphi(N)) =1$, i.e. $e$ and $\varphi(N)$
are coprime. The most common choice here is $ e= 2^{16} +1 = 65537 $, as $e$ is released as part of the public key.
\item For the private key, % TODO
\end{enumerate}
\clearpage
%\printglossary[type=\acronymtype]
%\printglossary
\printbibliography
\end{document}

35
entropy.bib Normal file
View File

@@ -0,0 +1,35 @@
@misc{ enwiki:shannon-hartley,
author = "{Wikipedia contributors}",
title = "Shannon--Hartley theorem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633",
note = "[Online; accessed 29-October-2025]"
}
@misc{ enwiki:noisy-channel,
author = "{Wikipedia contributors}",
title = "Noisy-channel coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870",
note = "[Online; accessed 29-October-2025]"
}
@misc{ enwiki:source-coding,
author = "{Wikipedia contributors}",
title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
note = "[Online; accessed 29-October-2025]"
}
@misc{ dewiki:nyquist-shannon,
author = "Wikipedia",
title = "Nyquist-Shannon-Abtasttheorem --- Wikipedia{,} die freie Enzyklopädie",
year = "2025",
url = "https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066",
note = "[Online; Stand 29. Oktober 2025]"
}
@misc{ enwiki:information-content,
author = "{Wikipedia contributors}",
title = "Information content --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600",
note = "[Online; accessed 29-October-2025]"
}

368
entropy.tex Normal file
View File

@@ -0,0 +1,368 @@
\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage[margin=1in]{geometry} % Adjust margins
\usepackage{caption}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage{parskip} % dont indent after paragraphs, figures
\usepackage{xcolor}
%\usepackage{csquotes} % Recommended for biblatex
\usepackage{tikz}
\usepackage{pgfplots}
\usetikzlibrary{positioning}
\usepackage{float}
\usepackage{amsmath}
\PassOptionsToPackage{hyphens}{url}
\usepackage{hyperref} % allows urls to follow line breaks of text
\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
\addbibresource{entropy.bib}
\title{Entropy as a measure of information}
\date{\today}
\begin{document}
\maketitle
\section{Introduction}
Across disciplines, entropy is a measure of uncertainty or randomness.
Originating in classical thermodynamics,
over time it has been applied in different sciences such as chemistry and information theory.
%As the informal concept of entropy gains popularity, its specific meaning can feel far-fetched and ambiguous.
The name `entropy' was first coined by German physicist \textit{Rudolf Clausius} in 1865
while postulating the second law of thermodynamics, one of 3(4)
laws of thermodynamics based on universal observation regarding heat and energy conversion.
Specifically, the second law states that not all thermal energy can be converted into work in a cyclic process.
Or, in other words, that the entropy of an isolated system cannot decrease,
as they always tend toward a state of thermodynamic equilibrium where entropy is highest for a given internal energy.
Another result of this observation is the irreversibility of natural processes, also referred to as the \textit{arrow of time}.
Even though the first law (conservation of energy) allows for a cup falling off a table and breaking
as well as the reverse process of reassembling itself and jumping back onto the table,
the second law only allows the former and denies the latter,
requiring the state with higher entropy to occur later in time.
Only 10 years later, in 1875, \textit{Ludwig Boltzmann} and \textit{Willard Gibbs} derived the formal definition
that is still in use in information theory today.
\begin{equation}
S = -k_B \sum_i p_i \ln(p_i)
\end{equation}
It gives statistical meaning to the macroscopic phenomenon of classical thermodynamics
by defining the entropy $S$ of a macrostate as
the result of probabilities $p_i$ of all its constituting micro states.
$k_B$ refers to the Boltzmann constant, which he himself did not determine but is part of today's SI system.
\section{Shannon's axioms}
\textit{Claude Shannon} adapted the concept of entropy to information theory.
In an era of advancing communication technologies, the question he addressed was of increasing importance:
How can messages be encoded and transmitted efficiently?
He proposed 3(4) axioms a measure of information would have to comply with:
\begin{enumerate}
\item $I(1) = 0$: events that always occur do not communicate information.
\item $I(p)$ is monotonically decreasing in $p$, i.e.\ $I'(p) \leq 0$: an increase in the probability of an event
decreases the information from an observed event, and vice versa.
\item $I(p_1 \cdot p_2) = I(p_1) + I(p_2)$: the information learned from independent events
is the sum of the information learned from each event.
\item $I(p)$ is a twice continuously differentiable function of $p$.
\end{enumerate}
As a measure, Shannon's formula uses the \textit{Bit}, quantifying the efficiency of codes
and media for transmission and storage.
In information theory, entropy can be understood as the expected information of a message.
\begin{equation}
H = E(I) = - \sum_i p_i \log_2(p_i)
\label{eq:entropy-information}
\end{equation}
This leaves $ I = \log_2(1/p_i) = - \log_2(p_i)$, implying that an unexpected message (low probability) carries
more information than one with higher probability.
Intuitively, we can imagine David A. Johnston, a volcanologist reporting day after day that there is no
activity on Mount St. Helens. After a while, we grow to expect this message because it is statistically very likely
that tomorrow's message will be the same. When some day we get the message `Vancouver! This is it!' it carries a lot of information
not only semantically (because it announces the eruption of a volcano) but statistically because it was very unlikely
given the transmission history.
However, uncertainty (entropy) in this situation would be relatively low.
Because we attach high surprise only to the unlikely message of an eruption, the significantly more likely message
carries less information - we already expected it before it arrived.
Putting the axioms and our intuitive understanding of information and uncertainty together,
we can see the logarithmic decay of information transported by a message as its probability increases in \autoref{fig:graph-information},
as well as the entropy for a 2-event source given by solving \autoref{eq:entropy-information} for $i=2$, resulting in
$-p \cdot \log_2(p) - (1-p) \cdot \log_2(1-p) $.
\begin{figure}[H]
\begin{minipage}{.5\textwidth}
\begin{tikzpicture}
\begin{axis}[
domain=0:1,
samples=100,
axis lines=middle,
xlabel={$p$},
ylabel={Information [bits]},
xmin=0, xmax=1,
ymin=0, ymax=6.1,
grid=both,
width=8cm,
height=6cm,
every axis x label/.style={at={(current axis.right of origin)}, anchor=west},
every axis y label/.style={at={(current axis.above origin)}, anchor=south},
]
\addplot[thick, blue] {-log2(x)};
\end{axis}
\end{tikzpicture}
\caption{Information contained in a message depending on its probability $p$}
\label{fig:graph-information}
\end{minipage}
\begin{minipage}{.5\textwidth}
\begin{tikzpicture}
\begin{axis}[
domain=0:1,
samples=100,
axis lines=middle,
xlabel={$p$},
ylabel={Entropy [bits]},
xmin=0, xmax=1,
ymin=0, ymax=1.1,
xtick={0,0.25,0.5,0.75,1},
grid=both,
width=8cm,
height=6cm,
every axis x label/.style={at={(current axis.right of origin)}, anchor=west},
every axis y label/.style={at={(current axis.above origin)}, anchor=south},
]
\addplot[thick, blue] {-x * log2(x) - (1-x) * log2(1-x)};
\end{axis}
\end{tikzpicture}
\caption{Entropy of an event source with two possible events, depending on their probabilities $(p, 1-p)$}
\label{fig:graph-entropy}
\end{minipage}
\end{figure}
The base 2 is chosen for the logarithm as our computers rely on a system of the same base, but theoretically
arbitrary bases can be used as they are proportional according to $\log_a b = \frac{\log_c b}{\log_c a} $.
Further, the $\log_2$ can be intuitively understood for an event source with $2^n$ possible outcomes -
using standard binary coding, we can easily see that a message has to contain $\log_2(2^n) = n$ Bits
in order to be able to encode all possible outcomes.
For numbers where $a \neq 2^n$ such as $a=10$, it is easy to see that there exists a number $a^k = 2^n$
which defines a message size that can encode the outcomes of $k$ event sources with $a$ outcomes each,
leaving the required Bits per event source at $\log_2(a^k) \div k = \log_2(a)$.
%- bedingte Entropie
%- Redundanz
%- Quellentropie
\section{Applications}
\subsection{Decision Trees}
A decision tree is a supervised learning approach commonly used in machine learning.
The goal is to create an algorithm, i.e.\ a series of questions to pose to new data (input variables)
in order to predict the target variable, a class label.
Graphically, each question can be visualized as a node in a tree, splitting the dataset into two or more groups.
This process is applied to the source set and then its resulting sets in a process called \textit{recursive partitioning}.
Once a leaf is reached, the class of the input has been successfully determined.
In order to build the shallowest possible trees, we want to use input variables that minimize uncertainty.
While other measures for the best choice such as the \textit{Gini coefficient} exist,
entropy is a popular measure used in decision trees.
Using what we learned about entropy, we want the maximum decrease in entropy of our target variable,
as explained in~\autoref{ex:decisiontree}.
\begin{figure}[H]
\centering
\begin{minipage}{.3\textwidth}
\begin{tabular}{c|c|c}
& hot & cold \\
\hline
rain &4 &5 \\
\hline
no rain & 3 & 2 \\
\end{tabular}
\end{minipage}
\begin{minipage}{.6\textwidth}
When choosing rain as a target variable, the entropy prior to partitioning is $H_{prior} = H(\frac{9}{14},\frac{5}{14})$,
after partitioning by temperature (hot/cold), $H_{hot}= H(\frac{4}{7}, \frac{3}{7})$
and $H_{cold}= H(\frac{5}{7}, \frac{2}{7})$ remain.
This leaves us with an expected entropy of
$p_{hot} * H_{hot} + p_{cold} * H_{cold} $ .
The \textbf{information gain} can then be calculated as the difference of entropy prior and post partitioning.
Since $H_{prior}$ is constant in this equation, it is sufficient to minimize post-partitioning $E[H]$.
\end{minipage}
\caption{Example of information gain in decision trees}
\label{ex:decisiontree}
\end{figure}
Advantages of decision trees over other machine learning approaches include low computation cost and
interpretability, making it a popular choice for many applications.
However, drawbacks include overfitting and poor robustness, where minimal alterations to training data
can lead to a change in tree structure.
\subsection{Cross-Entropy}
When dealing with two distributions, the \textit{cross-entropy} (closely related to, but distinct from, the Kullback--Leibler divergence defined below)
between a true distribution $p$
and an estimated distribution $q$ is defined as:
\begin{equation}
H(p, q) = -\sum_x p(x) \log_2 q(x)
\end{equation}
The \textit{Kullback--Leibler divergence} measures how much information is lost when $q$
is used to approximate $p$:
\begin{equation}
D_{KL}(p \| q) = H(p, q) - H(p)
\end{equation}
In machine learning, this term appears in many loss/cost functions — notably in classification problems
(cross-entropy loss) and in probabilistic models such as Variational Autoencoders (VAEs).
There, the true and predicted label are used as the true and estimated distribution, respectively.
In a supervised training example, the cross entropy loss degenerates to $-\log(p_{pred i})$ as the
true label vector is assumed to be the unit vector $e_i$ (one-hot).
\subsection{Coding}
The concept of entropy also plays a crucial role in the design and evaluation of codes used for data compression and transmission.
In this context, \textit{coding} refers to the representation of symbols or messages
from a source using a finite set of codewords.
Each codeword is typically composed of a sequence of bits,
and the design goal is to minimize the average length of these codewords while maintaining unique decodability.
According to Shannon's source coding theorem, the theoretical lower bound for the average codeword length of a source
is given by its entropy $H$.
In other words, no lossless coding scheme can achieve an average length smaller than the source entropy when expressed in bits.
Codes that approach this bound are called \textit{efficient} or \textit{entropy-optimal}.
A familiar example of such a scheme is \textit{Huffman coding},
which assigns shorter codewords to more probable symbols and longer ones to less probable symbols,
resulting in a prefix-free code with minimal expected length.
Beyond compression, coding is essential for reliable communication over imperfect channels.
In real-world systems, transmitted bits are often corrupted by noise, requiring mechanisms to detect and correct errors.
One simple but powerful concept to quantify the robustness of a code is the \textit{Hamming distance}.
The Hamming distance between two codewords is defined as the number of bit positions in which they differ.
For example, the codewords $10110$ and $11100$ have a Hamming distance of 2.
A code with a minimum Hamming distance $d_{min}$ can detect up to $d_{min}-1$ errors
and correct up to $\lfloor (d_{min}-1)/2 \rfloor$ errors.
This insight forms the basis of error-correcting codes such as Hamming codes,
which add redundant bits to data in a structured way that enables the receiver to both identify and correct single-bit errors.
Thus, the efficiency and reliability of communication systems are governed by a trade-off:
higher redundancy (lower efficiency) provides greater error correction capability,
while minimal redundancy maximizes data throughput but reduces error resilience.
%Coding of a source of an information and communication channel
% https://www.youtube.com/watch?v=ErfnhcEV1O8
% relation to hamming distance and efficient codes
\subsection{Noisy communication channels}
The noisy channel coding theorem was stated by \textit{Claude Shannon} in 1948, but the first rigorous proof was
provided in 1954 by Amiel Feinstein.
One of the important issues Shannon tackled with his `Mathematical theory of communication'
was the insufficient means of transporting discrete data through a noisy channel that were more efficient than
the telegram - or, how to communicate reliably over an unreliable channel.
The means of error correction until then had been limited to very basic means.
\begin{figure}[H]
\begin{tikzpicture}
\def\boxw{2.5cm}
\def\n{5}
\pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
% Draw the boxes
\node (A) at (0, 0) [draw, text width=\boxw, align=center] {Information Source};
\node (B) at (\boxw + \gap, 0) [draw, text width=\boxw, align=center] {Transmitter};
\node (C) at ({2*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Channel};
\node (N) at ({2*(\boxw + \gap)}, -1) [draw, text width=\boxw, align=center] {Noise};
\node (D) at ({3*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Receiver};
\node (E) at ({4*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Destination};
% Draw arrows between the boxes
\draw[->] (A) -- (B);
\draw[->] (B) -- (C);
\draw[->] (C) -- (D);
\draw[->] (D) -- (E);
\draw[->] (N) -- (C);
\end{tikzpicture}
\caption{Model of a noisy communication channel}
\label{fig:noisy-channel}
\end{figure}
First, analogue connections like the first telephone lines, bypassed the issue altogether and relied
on the communicating parties and their brains' ability to filter human voices from the noise that was inevitably transmitted
along with the intended signal.
After some development, the telegraph in its final form used Morse code, a series of long and short clicks, that,
together with letter and word gaps, would encode text messages.
Even though the long-short coding might appear similar to today's binary coding, the means of error correction were lacking.
For a long time, it relied on simply repeating the message multiple times, which is highly inefficient.
The destination would then have to determine the most likely intended message by performing a majority vote.
One might also propose simply increasing transmitting power, thereby decreasing the error rate of the associated channel.
However, the noisy channel coding theorem provides us with a more elegant solution.
It is of foundational importance to information theory, stating that given a noisy channel with capacity $C$,
information can be transmitted at any rate $R<C$ with suitable coding so that the error rate at the receiver becomes
arbitrarily small.
\paragraph{Channel capacity and mutual information}
For any discrete memoryless channel, we can describe its behavior with a conditional probability distribution
$p(y|x)$ — the probability that symbol $y$ is received given symbol $x$ was sent.
The \textit{mutual information} between the transmitted and received signals measures how much information, on average, passes through the channel:
\begin{equation}
I(X;Y) = \sum_{x,y} p(x, y) \log_2 \frac{p(x, y)}{p(x)p(y)} = H(Y) - H(Y|X)
\end{equation}
The \textit{channel capacity} $C$ is then defined as the maximum achievable mutual information across all possible input distributions:
\begin{equation}
C = \max_{p(x)} I(X;Y)
\end{equation}
It represents the highest rate (in bits per symbol) at which information can be transmitted with arbitrarily small error,
given optimal encoding and decoding schemes.
\paragraph{Binary symmetric channel (BSC)}
The binary symmetric channel is one example of such discrete memoryless channels, where each transmitted bit
has a probability $p$ of being flipped during transmission and a probability $(1-p)$ of being received correctly.
\begin{figure}[H]
\begin{tikzpicture}
\def\boxw{2.5cm}
\def\n{4}
\pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
\node (S) at (0,0) [draw, align=center, text width=\boxw] {Transmitter};
\node (S0) at (\boxw + \gap,1) [draw, circle] {0};
\node (S1) at (\boxw + \gap,-1) [draw, circle] {1};
\node (D0) at ({2*(\boxw + \gap)},1) [draw, circle] {0};
\node (D1) at ({2*(\boxw + \gap)},-1) [draw, circle] {1};
\node (D) at ({3*(\boxw + \gap)},0) [draw, align=center, text width=\boxw] {Receiver};
\draw[->] (S) -- (S0);
\draw[->] (S) -- (S1);
\draw[->,dashed] (S0) -- (D0) node[midway, above] {$1-p$};
\draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$p$};
\draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$p$};
\draw[->,dashed] (S1) -- (D1) node[midway, below] {$1-p$};
\draw[->] (D0) -- (D);
\draw[->] (D1) -- (D);
\end{tikzpicture}
\caption{Binary symmetric channel with crossover probability $p$}
\label{fig:binary-channel}
\end{figure}
The capacity of the binary symmetric channel is given by:
\begin{equation}
C = 1 - H_2(p)
\end{equation}
where $H_2(p) = -p \log_2(p) - (1-p)\log_2(1-p)$ is the binary entropy function.
As $p$ increases, uncertainty grows and channel capacity declines.
When $p = 0.5$, output bits are completely random and no information can be transmitted ($C = 0$).
As illustrated in \autoref{fig:graph-entropy}, an error rate of $p > 0.5$ is equivalent to an error rate of $1-p < 0.5$ (every received bit can simply be inverted),
though not relevant in practice.
Shannon's theorem is not constructive as it does not provide an explicit method for constructing such efficient codes,
but it guarantees their existence.
In practice, structured codes such as Hamming and Reed--Solomon codes are employed to approach channel capacity.
\section{Conclusion}
Entropy provides a fundamental measure of uncertainty and information,
bridging concepts from thermodynamics to modern communication theory.
Beyond the provided examples, the concept of entropy has far-reaching applications in diverse fields:
from cryptography, where it quantifies randomness and security,
to statistical physics, where it characterizes disorder in complex systems,
to biology, connecting molecular information and population diversity.
\printbibliography
\end{document}