commit 08e7ab4681b6ae55593ef4562d927fedc50f9961 Author: eneller Date: Thu Feb 12 01:17:45 2026 +0100 Initial Commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eab0218 --- /dev/null +++ b/.gitignore @@ -0,0 +1,336 @@ +# Created by https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=latex,visualstudiocode + +### LaTeX ### +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Intermediate documents: +*.dvi +*.xdv +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +*.pdf + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Build tool directories for auxiliary files +# latexrun +latex.out/ + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.pre +*.snm +*.vrb + +# changes +*.soc + +# comment +*.cut + +# cprotect +*.cpt + +# elsarticle (documentclass of Elsevier journals) +*.spl + +# endnotes +*.ent + +# fixme +*.lox + +# feynmf/feynmp +*.mf +*.mp +*.t[1-9] +*.t[1-9][0-9] +*.tfm + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs +*.lzo +*.lzs +*.slg +*.slo +*.sls + +# uncomment this for glossaries-extra (will ignore makeindex's style files!) 
+# *.ist + +# gnuplot +*.gnuplot +*.table + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.glog +*.gtex + +# htlatex +*.4ct +*.4tc +*.idv +*.lg +*.trc +*.xref + +# hyperref +*.brf + +# knitr +*-concordance.tex +# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files +# *.tikz +*-tikzDictionary + +# listings +*.lol + +# luatexja-ruby +*.ltjruby + +# makeidx +*.idx +*.ilg +*.ind + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc[0-9]* +*.slf[0-9]* +*.slt[0-9]* +*.stc[0-9]* + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# newpax +*.newpax + +# nomencl +*.nlg +*.nlo +*.nls + +# pax +*.pax + +# pdfpcnotes +*.pdfpc + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# svg +svg-inkscape/ + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# tcolorbox +*.listing + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# titletoc +*.ptc + +# todonotes +*.tdo + +# vhistory +*.hst +*.ver + +# easy-todo +*.lod + +# xcolor +*.xcp + +# xmpincl +*.xmpi + +# xindy +*.xdy + +# xypic precompiled matrices and outlines +*.xyc +*.xyd + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# LyX +*.lyx~ + +# Kile +*.backup + +# gummi +.*.swp + +# KBibTeX +*~[0-9]* + +# TeXnicCenter +*.tps + +# auto folder when using emacs and auctex +./auto/* +*.el + +# expex forward references with \gathertags +*-tags.tex + +# standalone packages +*.sta + +# Makeindex log files +*.lpz + +# xwatermark package +*.xwm + +# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib +# option is specified. Footnotes are the stored in a file with suffix Notes.bib. +# Uncomment the next line to have this generated file ignored. 
+#*Notes.bib + +### LaTeX Patch ### +# LIPIcs / OASIcs +*.vtc + +# glossaries +*.glstex + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode +bib +*.*-SAVE-ERROR +*.ist diff --git a/.latexmkrc b/.latexmkrc new file mode 100644 index 0000000..be9393a --- /dev/null +++ b/.latexmkrc @@ -0,0 +1,5 @@ +$latex = 'latex %O --shell-escape %S'; +$pdflatex = 'pdflatex %O --shell-escape %S'; +$pdf_mode = 1; +$clean_ext = "lol nav snm loa bbl* glo ist"; +$bibtex_use = 2; diff --git a/compression.bib b/compression.bib new file mode 100644 index 0000000..850649c --- /dev/null +++ b/compression.bib @@ -0,0 +1,94 @@ +@article{shannon1948mathematical, + title={A mathematical theory of communication}, + author={Shannon, Claude E}, + journal={The Bell system technical journal}, + volume={27}, + number={3}, + pages={379--423}, + year={1948}, + publisher={Nokia Bell Labs} +} +@misc{ enwiki:shannon-source-coding, + author = "{Wikipedia contributors}", + title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440", + note = "[Online; accessed 25-November-2025]" +} +@misc{ enwiki:shannon-fano, + author = "{Wikipedia contributors}", + title = "Shannon–Fano coding --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Fano_coding&oldid=1315776380", + note = "[Online; accessed 26-November-2025]" +} +@misc{ enwiki:huffman-code, + author = "{Wikipedia contributors}", + title = "Huffman coding --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Huffman_coding&oldid=1321991625", + note = "[Online; accessed 26-November-2025]" +} +@misc{ enwiki:lzw, + author = "{Wikipedia contributors}", + title = "Lempel–Ziv–Welch --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Lempel%E2%80%93Ziv%E2%80%93Welch&oldid=1307959679", + note = "[Online; accessed 26-November-2025]" +} +@misc{ enwiki:arithmetic-code, + author = "{Wikipedia contributors}", + title = "Arithmetic coding --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Arithmetic_coding&oldid=1320999535", + note = "[Online; accessed 26-November-2025]" +} +@misc{ enwiki:kraft-mcmillan, + author = "{Wikipedia contributors}", + title = "Kraft–McMillan inequality --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Kraft%E2%80%93McMillan_inequality&oldid=1313803157", + note = "[Online; accessed 26-November-2025]" +} +@misc{ enwiki:partition, + author = "{Wikipedia contributors}", + title = "Partition problem --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Partition_problem&oldid=1320732818", + note = "[Online; accessed 30-November-2025]" +} +@misc{ dewiki:shannon-fano, + author = "Wikipedia", + title = "Shannon-Fano-Kodierung --- Wikipedia{,} die freie Enzyklopädie", + year = "2024", + url = 
"https://de.wikipedia.org/w/index.php?title=Shannon-Fano-Kodierung&oldid=246624798", + note = "[Online; Stand 26. November 2025]" +} +@misc{ dewiki:huffman-code, + author = "Wikipedia", + title = "Huffman-Kodierung --- Wikipedia{,} die freie Enzyklopädie", + year = "2025", + url = "https://de.wikipedia.org/w/index.php?title=Huffman-Kodierung&oldid=254369306", + note = "[Online; Stand 26. November 2025]" +} +@misc{ dewiki:lzw, + author = "Wikipedia", + title = "Lempel-Ziv-Welch-Algorithmus --- Wikipedia{,} die freie Enzyklopädie", + year = "2025", + url = "https://de.wikipedia.org/w/index.php?title=Lempel-Ziv-Welch-Algorithmus&oldid=251943809", + note = "[Online; Stand 26. November 2025]" +} +@misc{ dewiki:kraft-mcmillan, + author = "Wikipedia", + title = "Kraft-Ungleichung --- Wikipedia{,} die freie Enzyklopädie", + year = "2018", + url = "https://de.wikipedia.org/w/index.php?title=Kraft-Ungleichung&oldid=172862410", + note = "[Online; Stand 26. November 2025]" +} +@misc{ dewiki:partition, + author = "Wikipedia", + title = "Partitionsproblem --- Wikipedia{,} die freie Enzyklopädie", + year = "2025", + url = "https://de.wikipedia.org/w/index.php?title=Partitionsproblem&oldid=255787013", + note = "[Online; Stand 26. November 2025]" +} \ No newline at end of file diff --git a/compression.tex b/compression.tex new file mode 100644 index 0000000..fdba431 --- /dev/null +++ b/compression.tex @@ -0,0 +1,332 @@ +\documentclass{article} +%%% basic layouting +\usepackage[utf8x]{inputenc} +\usepackage[margin=1in]{geometry} % Adjust margins +\usepackage{caption} +\usepackage{hyperref} +\PassOptionsToPackage{hyphens}{url} % allow breaking urls +\usepackage{float} +\usepackage{wrapfig} +\usepackage{subcaption} +\usepackage{parskip} % dont indent after paragraphs, figures +\usepackage{xcolor} +%%% algorithms +\usepackage{algorithm} +\usepackage{algpseudocodex} +% graphs and plots +\usepackage{tikz} +\usepackage{pgfplots} +\usetikzlibrary{positioning} +\usetikzlibrary{trees} +%\usetikzlibrary{graphs, graphdrawing} +%%% math +\usepackage{amsmath} +%%% citations +\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex} +%\usepackage{csquotes} % Recommended for biblatex +\addbibresource{compression.bib} + +\title{Compression} +\date{\today} + +\begin{document} +\maketitle +\section{Introduction} +As the volume of data grows exponentially around the world, compression is only gaining in importance to all disciplines. +Not only does it enable the storage of large amounts of information needed for research in scientific domains +like DNA sequencing and analysis, it also plays a vital role in keeping stored data accessible by +facilitating cataloging, search and retrieval. +The concept of entropy introduced in the previous entry is closely related to the design of efficient codes for compression. +In coding theory, the events of an information source are to be encoded in a manner that minimizes the bits needed to store +the information provided by the source. +The process of encoding can thus be described by a function $C$ transforming from a source alphabet $X$ to a code alphabet $Y$. +Symbols in the alphabets are denominated $x_i$ and $y_j$ respectively, and have underlying probabilities $p_{i}$. 
+% TODO fix use of alphabet / symbol / code word: alphabet is usually binary -> code word is 010101
+\begin{equation}
+    C: X \rightarrow Y \qquad X=\{x_1,x_2,...x_n\} \qquad Y=\{y_1,y_2,...y_m\}
+    \label{eq:formal-code}
+\end{equation}
+
+The understanding of entropy as the expected information $E(I)$ of a message provides an intuition that,
+for a source with a given entropy (in bits), no code can have a lower average word length $E(L)$ (in bits)
+than this entropy without losing information, where $l_j$ denotes the length of code word $y_j$.
+\begin{equation}
+    H = E(I) = - \sum_i p_i \log_2(p_i) \quad \leq \quad E(L) = \sum_j p_j l_j
+    \label{eq:entropy-information}
+\end{equation}
+This is the content of Shannon's source coding theorem,
+introduced in \citeyear{shannon1948mathematical}.
+In his paper, \citeauthor{shannon1948mathematical} proposed two principal ideas to minimize the average length of a code.
+The first is to use short codes for symbols with higher probability.
+This is an intuitive approach, as more frequent symbols have a higher impact on the average code length.
+The second idea is to encode events that frequently occur together as a single unit, artificially increasing
+the size of the code alphabet $Y$ to allow for greater flexibility in code design.\cite{enwiki:shannon-source-coding}
+
+Codes can have several properties. A code where all codewords have equal lengths is called a \textit{block code}.
+While easy to construct, block codes are not well suited for our goal of minimizing the average word length
+as specified in \autoref{eq:entropy-information}, because the source symbols are generally not uniformly distributed,
+i.e. it usually does not hold that $p_i = \frac{1}{n}$.
+
+In order to send (or store, for that matter) multiple code words in succession, a code $Y$ has to be uniquely decodable.
+When receiving 0010 in succession using the nonsingular code $Y_2$ from \autoref{tab:code-properties},
+it is not clear to the recipient which source symbols make up the intended message.
+For the specified sequence, there are a total of three possibilities to decode the received code:
+$s_0 s_3 s_0$, $s_0 s_0 s_1$ or $s_2 s_1$ could all be the intended message, making the code useless.
+
+\begin{table}[H]
+\centering
+\begin{tabular}{c l l l}
+    Source Code $X$ & Prefix Code $Y_0$ & Suffix Code $Y_1$ & Nonsingular Code $Y_2$ \\
+    \hline
+    $s_0$ & 0 & 0 & 0 \\
+    $s_1$ & 10 & 01 & 10 \\
+    $s_2$ & 110 & 011 & 00 \\
+    $s_3$ & 1110 & 0111 & 01 \\
+\end{tabular}
+\caption{Examples of different properties of codes}
+\label{tab:code-properties}
+\end{table}
+Another interesting property of a code, specifically important for transmission but less so for storage, is
+being prefix-free.
+A prefix code (also said to be prefix-free) can be decoded by the receiver as soon as each code word has been received,
+because no code word $y_j$ is the prefix of another valid code word.
+As shown in \autoref{tab:code-properties}, $Y_0$ is a prefix code, in this case more specifically called a \textit{comma code}
+because each code word is terminated by a 0 that separates it from the next code word.
+$Y_1$ in contrast is called a \textit{capital code} (it capitalizes the beginning of each word) and is not a prefix code.
+In the case of the capital code, in fact, every word other than the longest possible code word is a prefix of the longer words
+lower in the table. As a result, the receiver cannot instantaneously decode each word but rather has to wait for the leading 0
+of the next codeword.
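+
+The bound in \autoref{eq:entropy-information} can also be checked numerically.
+The symbol probabilities are not specified in \autoref{tab:code-properties}; the following sketch therefore assumes
+$p = (\frac{1}{2}, \frac{1}{4}, \frac{1}{8}, \frac{1}{8})$ purely for illustration and compares the entropy of such a
+source with the average word length of the prefix code $Y_0$:
+\begin{verbatim}
+import math
+
+# Assumed, purely illustrative probabilities for s0..s3 (not given in the table).
+probs = [1/2, 1/4, 1/8, 1/8]
+# Codeword lengths of the prefix code Y0 = {0, 10, 110, 1110}.
+lengths = [1, 2, 3, 4]
+
+entropy = -sum(p * math.log2(p) for p in probs)        # H = E(I) = 1.75 bits
+avg_len = sum(p * l for p, l in zip(probs, lengths))   # E(L)    = 1.875 bits
+
+print(entropy, avg_len)  # E(L) >= H, as the source coding theorem demands
+\end{verbatim}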
+
+
+Further, a code is said to be \textit{efficient} if it has the smallest possible average word length, i.e. its
+average word length matches the entropy of the source alphabet.
+
+\section{Kraft-McMillan inequality}
+The Kraft-McMillan inequality gives a necessary and sufficient condition for the existence of a prefix code.
+In the form shown in \autoref{eq:kraft-mcmillan}, it is intuitive to understand when considering a code tree.
+Because prefix codes require code words to be situated only on the leaves of a code tree,
+every code word of length $l_i$ over a code alphabet of size $r$ uses up exactly a fraction $r^{-l_i}$ of the available leaves.
+The sum over all code words can thus never be larger than one, otherwise
+the code will not be uniquely decodable \cite{enwiki:kraft-mcmillan}.
+\begin{equation}
+    \sum_i r^{-l_i} \leq 1
+    \label{eq:kraft-mcmillan}
+\end{equation}
+
+\section{Shannon-Fano}
+Shannon-Fano coding is one of the earliest methods for constructing prefix codes.
+It is a top-down method that splits the symbols, sorted by probability, into two groups of approximately equal total probability,
+recursively partitioning them to assign shorter codewords to more frequent events.
+
+\begin{algorithm}
+\caption{Shannon-Fano compression}
+\label{alg:shannon-fano}
+\begin{algorithmic}
+    \Procedure{ShannonFano}{symbols, probabilities}
+        \If{length(symbols) $= 1$}
+            \State \Return codeword for single symbol
+        \EndIf
+        \State Sort symbols (and their probabilities) in order of decreasing probability
+        \State $\text{current\_sum} \gets 0$
+        \State $\text{split\_index} \gets 0$
+        \For{$i \gets 1$ \textbf{to} length(symbols)}
+            \If{$|\text{current\_sum} + \text{probabilities}[i] - 0.5| < |\text{current\_sum} - 0.5|$}
+                \State $\text{current\_sum} \gets \text{current\_sum} + \text{probabilities}[i]$
+                \State $\text{split\_index} \gets i$
+            \EndIf
+        \EndFor
+        \State $\text{left\_group} \gets \text{symbols}[1 : \text{split\_index}]$
+        \State $\text{right\_group} \gets \text{symbols}[\text{split\_index} + 1 : \text{length(symbols)}]$
+        \State Assign prefix ``0'' to codes from ShannonFano($\text{left\_group}, \ldots$)
+        \State Assign prefix ``1'' to codes from ShannonFano($\text{right\_group}, \ldots$)
+    \EndProcedure
+\end{algorithmic}
+\end{algorithm}
+
+While Shannon-Fano coding guarantees the generation of a prefix-free code with an average word length close to the entropy,
+it is not guaranteed to be optimal. In practice, it often generates codewords that are only slightly longer than necessary.
+Weaknesses of the algorithm also include the non-trivial partitioning step \cite{enwiki:partition},
+which can in practice, however, be solved reasonably efficiently.
+Due to the aforementioned limitations, neither of the two (historically somewhat ambiguous) Shannon-Fano variants is widely used today;
+Huffman coding, as described in the next section, is preferred instead.
+
+\section{Huffman Coding}
+\label{sec:huffman}
+Huffman coding is an optimal prefix coding algorithm that minimizes the expected codeword length
+for a given set of symbol probabilities. Developed by David Huffman in 1952, it guarantees optimality
+by constructing a binary tree where the most frequent symbols are assigned the shortest codewords.
+Huffman coding comes within one bit of the entropy limit for discrete memoryless sources,
+making it one of the most important compression techniques in information theory.
+
+Unlike Shannon-Fano, which uses a top-down approach, Huffman coding employs a bottom-up strategy.
+The algorithm builds the code tree by iteratively combining the two symbols with the lowest probabilities
+into a new internal node.
This greedy approach ensures that the resulting tree minimizes the weighted path length, +where the weight of each symbol is its probability. + +\begin{algorithm} +\caption{Huffman coding algorithm} +\label{alg:huffman} +\begin{algorithmic} + \Procedure{Huffman}{symbols, probabilities} + \State Create a leaf node for each symbol and add it to a priority queue + \While{priority queue contains more than one node} + \State Extract two nodes with minimum frequency: $\text{left}$ and $\text{right}$ + \State Create a new internal node with frequency $\text{freq(left)} + \text{freq(right)}$ + \State Set $\text{left}$ as the left child and $\text{right}$ as the right child + \State Add the new internal node to the priority queue + \EndWhile + \State $\text{root} \gets$ remaining node in priority queue + \State Traverse tree and assign codewords: ``0'' for left edges, ``1'' for right edges + \State \Return codewords + \EndProcedure +\end{algorithmic} +\end{algorithm} + +The optimality of Huffman coding can be proven by exchange arguments. +The key insight is that if two codewords have the maximum length in an optimal code, they must correspond to the two least frequent symbols. +Moreover, these two symbols can be combined into a single meta-symbol without affecting optimality, +which leads to a recursive structure that guarantees Huffman's method produces an optimal code. + +The average codeword length $L_{\text{Huffman}}$ produced by Huffman coding satisfies the following bounds: +\begin{equation} + H(X) \leq L_{\text{Huffman}} < H(X) + 1 + \label{eq:huffman-bounds} +\end{equation} +where $H(X)$ is the entropy of the source. This means Huffman coding is guaranteed to be within one bit +of the theoretical optimum. In practice, when symbol probabilities are powers of $\frac{1}{2}$, +Huffman coding achieves perfect compression and $L_{\text{Huffman}} = H(X)$. + +The computational complexity of Huffman coding is $O(n \log n)$, where $n$ is the number of distinct symbols. +A priority queue implementation using a binary heap achieves this bound, making Huffman coding +efficient even for large alphabets. Its widespread use in compression formats such as DEFLATE, JPEG, and MP3 +testifies to its practical importance. + +However, Huffman coding has limitations. First, it requires knowledge of the probability distribution +of symbols before encoding, necessitating a preprocessing pass or transmission of frequency tables. +Second, it assigns an integer number of bits to each symbol, which can be suboptimal +when symbol probabilities do not align well with powers of two. +Symbol-by-symbol coding imposes a constraint that is often unneeded since codes will usually be packed in long sequences, +leaving room for further optimization as provided by Arithmetic Coding. + +\section{Arithmetic Coding} +Arithmetic coding is a modern compression technique that encodes an entire message as a single interval +within the range $[0, 1)$, as opposed to symbol-by-symbol coding used by Huffman. +By iteratively refining this interval based on the probabilities of the symbols in the message, +arithmetic coding can achieve compression rates that approach the entropy of the source. +Its ability to handle non-integer bit lengths makes it particularly powerful +for applications requiring high compression efficiency. + +In the basic form, a message is first written in the base of the alphabet with a leading '$0.$': $ \text{ABBCAB} = 0.011201_3$, +in this case yielding a ternary number as the alphabet is $ |\{A,B,C\}| = 3 $. 
+This number can then be encoded in the target base (usually 2) with sufficient precision to recover the original number,
+resulting in $0.0010110010_2$ (ten bits suffice in this example).
+The decoder only gets the rational number $q$ and the length $n$ of the original message.
+The encoding can then be easily reversed by changing the base back and rounding to $n$ digits.
+
+In general, arithmetic coding can produce near-optimal output for any given source probability distribution.
+This is achieved by adjusting the sub-intervals that are interpreted as a given source symbol.
+Given source probabilities of $p_A = \frac{6}{8}, p_B = p_C = \frac{1}{8}$, the intervals would be adjusted to
+$ A= [0,\frac{6}{8}), B=[\frac{6}{8}, \frac{7}{8}), C=[\frac{7}{8},1)$.
+Instead of transforming the base of the number and rounding to appropriate precision, the encoder recursively refines the interval and in the end chooses a number inside that interval.
+\begin{enumerate}
+    \item \textbf{Symbol A:} $A=[0, \frac{6}{8})$
+    \item $ A= [0,(\frac{6}{8})^2), B=[(\frac{6}{8})^2, \frac{7}{8} \cdot \frac{6}{8}), C=[\frac{7}{8} \cdot \frac{6}{8}, \frac{6}{8})$.
+    \item \textbf{Symbol B:} $B=[(\frac{6}{8})^2, \frac{7}{8} \cdot \frac{6}{8}) = [\frac{36}{64}, \frac{42}{64})$
+\end{enumerate}
+
+Depending on the implementation, the source message can also be encoded in a base one larger than the alphabet size,
+reserving room for a special \verb|END-OF-DATA| symbol; when the decoder encounters it, it stops reading from the input $q$.
+
+\section{LZW Algorithm}
+The Lempel-Ziv-Welch (LZW) algorithm is a dictionary-based compression method that dynamically builds a dictionary
+of recurring patterns in the data as compression proceeds. Unlike entropy-based methods such as Huffman or arithmetic coding,
+LZW does not require prior knowledge of symbol probabilities, making it highly adaptable and efficient
+for a wide range of applications, including image and text compression.
+The algorithm was developed by Abraham Lempel and Jacob Ziv, with refinements by Terry Welch in 1984.
+
+The fundamental insight of LZW is that many data sources contain repeating patterns that can be exploited
+by replacing longer sequences with shorter codes. Rather than assigning variable-length codes to individual symbols
+based on their frequency, LZW identifies recurring substrings and assigns them fixed-length codes.
+As the algorithm processes the data, it dynamically constructs a dictionary that maps these patterns to codes,
+without requiring the dictionary to be transmitted with the compressed data.
+
+\begin{algorithm}
+\caption{LZW compression algorithm}
+\label{alg:lzw}
+\begin{algorithmic}
+    \Procedure{LZWCompress}{data}
+        \State Initialize dictionary with all single characters
+        \State $\text{code} \gets$ next available code (typically 256 for byte alphabet)
+        \State $w \gets$ first symbol from data
+        \State $\text{output} \gets [\,]$
+        \For{each symbol $c$ in remaining data}
+            \If{$w + c$ exists in dictionary}
+                \State $w \gets w + c$
+            \Else
+                \State append $\text{code}(w)$ to output
+                \If{code $<$ max\_code}
+                    \State Add $w + c$ to dictionary with code $\text{code}$
+                    \State $\text{code} \gets \text{code} + 1$
+                \EndIf
+                \State $w \gets c$
+            \EndIf
+        \EndFor
+        \State append $\text{code}(w)$ to output
+        \State \Return output
+    \EndProcedure
+\end{algorithmic}
+\end{algorithm}
+
+The decompression process is equally elegant. The decompressor initializes an identical dictionary
+and reconstructs the original data by decoding the transmitted codes.
Crucially, the decompressor
+can reconstruct the dictionary entries on the fly as it processes the compressed data,
+recovering the exact sequence of dictionary updates that occurred during compression.
+This property is what allows the dictionary to remain implicit rather than explicitly transmitted.
+
+\begin{algorithm}
+\caption{LZW decompression algorithm}
+\label{alg:lzw-decompress}
+\begin{algorithmic}
+    \Procedure{LZWDecompress}{codes}
+        \State Initialize dictionary with all single characters
+        \State $\text{code} \gets$ next available code
+        \State $w \gets \text{decode}(\text{codes}[0])$
+        \State $\text{output} \gets w$
+        \For{each code $c$ in $\text{codes}[1:]$}
+            \If{$c$ exists in dictionary}
+                \State $k \gets \text{decode}(c)$
+            \Else
+                \State $k \gets w + w[0]$ \quad \{special case: $c$ is not yet in the dictionary, so it can only be $w$ followed by the first symbol of $w$\}
+            \EndIf
+            \State append $k$ to output
+            \State Add $w + k[0]$ to dictionary with code $\text{code}$
+            \State $\text{code} \gets \text{code} + 1$
+            \State $w \gets k$
+        \EndFor
+        \State \Return output
+    \EndProcedure
+\end{algorithmic}
+\end{algorithm}
+
+LZW's advantages make it particularly valuable for certain applications. First, it requires no statistical modeling
+of the input data, making it applicable to diverse data types without prior analysis.
+Second, the dictionary is built incrementally and implicitly, eliminating transmission overhead.
+Third, it can achieve significant compression on data with repeating patterns, such as text, images, and structured data.
+Fourth, the algorithm is relatively simple to implement and computationally efficient, with time complexity $O(n)$,
+where $n$ is the length of the input.
+
+However, LZW has notable limitations. Its compression effectiveness is highly dependent on the structure and repetitiveness
+of the input data. On truly random data with no repeating patterns, LZW can even increase the file size.
+Additionally, the fixed size of the dictionary (typically 12 or 16 bits, allowing $2^{12}=4096$ or $2^{16}=65536$ entries)
+limits its ability to adapt to arbitrarily large vocabularies of patterns.
+When the dictionary becomes full, most implementations stop adding new entries, potentially reducing compression efficiency.
+
+LZW has seen widespread practical deployment in compression standards and applications.
+The GIF image format uses LZW compression, as does the TIFF image format in some variants.
+
+The relationship between dictionary-based methods like LZW and entropy-based methods like Huffman
+is complementary rather than competitive. LZW excels at capturing structure and repetition,
+while entropy-based methods optimize symbol encoding based on probability distributions.
+This has led to hybrid approaches that combine both techniques, such as the Deflate algorithm,
+which uses LZSS (a variant of LZ77) followed by Huffman coding of the output to achieve better compression ratios.
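+
+To connect \autoref{alg:lzw} to something executable, the following minimal sketch (unbounded dictionary,
+integer output codes, no bit packing; all names are illustrative) compresses a short string:
+\begin{verbatim}
+def lzw_compress(data):
+    """Minimal LZW sketch: grow the dictionary as patterns repeat."""
+    dictionary = {chr(i): i for i in range(256)}   # start with all single characters
+    next_code = 256
+    w, output = "", []
+    for c in data:
+        if w + c in dictionary:
+            w = w + c                              # extend the current match
+        else:
+            output.append(dictionary[w])           # emit code for longest known prefix
+            dictionary[w + c] = next_code          # learn the new pattern
+            next_code += 1
+            w = c
+    if w:
+        output.append(dictionary[w])               # flush the final match
+    return output
+
+print(lzw_compress("TOBEORNOTTOBEORTOBEORNOT"))
+\end{verbatim}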
+ + +\printbibliography +\end{document} diff --git a/correction.tex b/correction.tex new file mode 100644 index 0000000..0523f80 --- /dev/null +++ b/correction.tex @@ -0,0 +1,31 @@ +\documentclass{article} +\usepackage[utf8x]{inputenc} +\usepackage[margin=1in]{geometry} % Adjust margins +\usepackage{caption} +\usepackage{wrapfig} +\usepackage{subcaption} +\usepackage{parskip} % dont indent after paragraphs, figures +\usepackage{xcolor} +%\usepackage{csquotes} % Recommended for biblatex +\usepackage{tikz} +\usepackage{pgfplots} +\usetikzlibrary{positioning} +\usepackage{float} +\usepackage{amsmath} +\PassOptionsToPackage{hyphens}{url} +\usepackage{hyperref} % allows urls to follow line breaks of text +\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex} +\addbibresource{entropy.bib} + + + + +\title{Error Correction Codes} +\date{\today} + +\begin{document} +\maketitle +(theorems: Hamming condition, Varsham-Gilbert) +CRC +\printbibliography +\end{document} diff --git a/crypto.bib b/crypto.bib new file mode 100644 index 0000000..e2521ce --- /dev/null +++ b/crypto.bib @@ -0,0 +1,47 @@ +@book{luenberger, + title={Information science}, + author={Luenberger, David G}, + year={2012}, + publisher={Princeton University Press} +} +@misc{ enwiki:maryofscots, + author = "{Wikipedia contributors}", + title = "Mary, Queen of Scots --- {Wikipedia}{,} The Free Encyclopedia", + year = "2026", + url = "https://en.wikipedia.org/w/index.php?title=Mary,_Queen_of_Scots&oldid=1333198012", + note = "[Online; accessed 22-January-2026]" +} +@misc{ enwiki:kerckhoff, + author = "{Wikipedia contributors}", + title = "Kerckhoffs's principle --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Kerckhoffs%27s_principle&oldid=1320402404", + note = "[Online; accessed 2-February-2026]" +} +@misc{ enwiki:confusion-diffusion, + author = "{Wikipedia contributors}", + title = "Confusion and diffusion --- {Wikipedia}{,} The Free Encyclopedia", + year = "2025", + url = "https://en.wikipedia.org/w/index.php?title=Confusion_and_diffusion&oldid=1307746165", + note = "[Online; accessed 3-February-2026]" +} +@ARTICLE{diffiehellman, + author={Diffie, W. 
and Hellman, M.},
+  journal={IEEE Transactions on Information Theory},
+  title={New directions in cryptography},
+  year={1976},
+  volume={22},
+  number={6},
+  pages={644-654},
+  keywords={Cryptography;Receivers;Authentication;Eavesdropping;Costs;Business;Public key cryptography},
+  doi={10.1109/TIT.1976.1055638}}
+@article{rsa,
+  title={A method for obtaining digital signatures and public-key cryptosystems},
+  author={Rivest, Ronald L and Shamir, Adi and Adleman, Leonard},
+  journal={Communications of the ACM},
+  volume={21},
+  number={2},
+  pages={120--126},
+  year={1978},
+  publisher={ACM New York, NY, USA}
+}
diff --git a/crypto.tex b/crypto.tex
new file mode 100644
index 0000000..8472838
--- /dev/null
+++ b/crypto.tex
@@ -0,0 +1,176 @@
+\documentclass{article}
+\usepackage[utf8x]{inputenc}
+\usepackage[margin=1in]{geometry} % Adjust margins
+\usepackage{caption}
+\usepackage{wrapfig}
+\usepackage{subcaption}
+\usepackage{parskip} % dont indent after paragraphs, figures
+\usepackage{xcolor}
+%\usepackage{csquotes} % Recommended for biblatex
+\usepackage{tikz}
+\usepackage{pgfplots}
+\usetikzlibrary{positioning}
+\usepackage{float}
+\usepackage{amsmath}
+\PassOptionsToPackage{hyphens}{url}
+\usepackage{hyperref} % allows urls to follow line breaks of text
+\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
+\addbibresource{crypto.bib}
+\usepackage{glossaries}
+\makeglossaries
+\newacronym{DES}{DES}{Data Encryption Standard}
+\newacronym{AES}{AES}{Advanced Encryption Standard}
+\newacronym{RSA}{RSA}{Rivest–Shamir–Adleman}
+
+
+
+
+\title{Cryptography}
+\date{\today}
+
+\begin{document}
+\maketitle
+\section{Introduction}
+Cryptography is ubiquitous in our modern world.
+While the origins of cryptography date back thousands of years, evidence of its use in ancient times is sparse \cite{luenberger}.
+Most of its use seems to have been reserved for political and military leaders, most notably Mary Queen of Scots,
+who, while in prison, plotted to kill Queen Elizabeth using encrypted letters \cite{enwiki:maryofscots}.
+With the widespread adoption of the internet, the need for several cryptographic functions arose.
+Due to its intended original use as a trusted research network (ARPANET),
+almost none of the original protocols were 'secure' in any sense of the word.
+
+A notable example to this day is SMTP, the \textit{Simple Mail Transfer Protocol}, used to send email to servers.
+In its original implementation, it allowed attackers to intercept emails in transit to read and modify them,
+and even to spoof the sender address to impersonate others.
+SMTP today is secured using a combination of mitigations for these attacks, such as STARTTLS, SPF, DKIM and DMARC,
+emphasizing the need for securely designed protocols.
+
+\subsection{Security}
+Common goals associated with security include the \textit{CIA triad}, consisting of
+\begin{itemize}
+    \item Confidentiality: Prevent unauthorized reading
+    \item Integrity: Prevent unauthorized modification
+    \item Availability: Prevent denial of service
+\end{itemize}
+Further goals include authenticity and non-repudiation. Cryptography can help with all of the aforementioned goals
+except availability.
+This can be achieved using several different applications of cryptography:
+\begin{itemize}
+    \item Encryption provides confidentiality by only saving / transmitting an encrypted message.
+    \item Hash functions ensure data has not been altered.
+    \item Digital signatures confirm that a message was indeed sent by whom we expect, preventing man-in-the-middle attacks
+    where the message is simply swapped out before reaching its destination, as well as providing proof that a message was sent (non-repudiation).
+    \item Certificates confirm the sender's identity.
+\end{itemize}
+
+Importantly, Kerckhoffs's principle \cite{enwiki:kerckhoff} is what allows us to go into detail on the following algorithms.
+Embraced by researchers today, it holds that the security of a cryptosystem should rely only on the secrecy of the key,
+allowing and encouraging the publication of cryptographic algorithms. \newline
+It is closely related to Shannon's maxim, stating that
+"one ought to design systems under the assumption that the enemy will immediately gain full familiarity with them".
+This is opposed to \textit{security through obscurity}, which does not allow for verification of the cryptographic
+algorithm through a scientific process in the public domain.
+
+\subsection{Hash Functions}
+A general hash function $h(m)$ is a function that takes a message $m$ of arbitrary length and produces an output $h$, called the \textit{hash},
+of fixed length. However, not every mathematical function can be considered a hash function.
+The main applications of hash functions include integrity checking and hash maps for efficient data retrieval.
+Depending on the application, different properties determine the usefulness of a function.
+
+An obvious desired property is efficiency: every application benefits from faster computing times.
+Also central to all applications of hash functions is a property called \textit{collision resistance}, where there should be no
+efficient way, i.e. no better way than brute force, to find $m_1 \neq m_2$ so that $h(m_1) = h(m_2)$.
+For cryptographic use the importance is clear. If a password is stored in hashed form to obfuscate the clear text,
+no security is gained if it is easy for an attacker to find a password that produces the same hash and thus passes the challenge.
+A similar notion holds true for data retrieval. If it is too easy to find collisions, e.g. if similar inputs produce similar outputs,
+there will be an uneven distribution in the target domain and thus little to no efficiency gain.
+
+Another desired property, important specifically for cryptographic use, is what is often used synonymously with the term hash function: being a \textit{one-way function}.
+Given $h(m)$, there should be no method more efficient than brute force to find a matching $m$. \newline
+As alluded to earlier, hash functions are readily used for integrity checking.
+By generating a fixed-size hash value for a given input, they allow users to verify that data has not been altered,
+whether intentionally or accidentally.
+For example, when downloading a file, comparing its hash with a published checksum ensures the file's integrity.
+They are also often used in combination with public-key cryptography, allowing the sender to sign with their private key
+to prove not only integrity but authenticity.
+
+
+
+\subsection{Encryption}
+Even though the desired properties of hash functions are similar to those of encryption, the fact that the input message is reduced to a fixed-size hash
+also means that every hash function inevitably loses information.
+Fundamentally, encryption has the goal of only allowing authorized parties to read a message.
+This is achieved by encoding the \textit{plaintext} into a \textit{ciphertext} and then transmitting/storing that ciphertext
+separately from the key necessary to decrypt it.
+
+Early encryption schemes intuitively demonstrate two concepts that can be employed to encode a message:
+\textit{substitution} and \textit{transposition}.
+
+\paragraph{Substitution} is used by
+the simple Caesar cipher, often implemented by rotating two disks against each other, each with the alphabet written out on them.
+\autoref{tab:caesar} shows a simple Caesar cipher where the cipher alphabet is simply shifted by $+3$ positions from the plaintext alphabet.
+In the process of encoding, A is therefore replaced (substituted) with D, B with E, and so on.
+Upon reception of the message, the same process is performed in reverse, i.e. shifted by $-3$.
+
+\begin{table}[h]
+\resizebox{\textwidth}{!}{%
+\begin{tabular}{c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c}
+    A&B&C&D&E&F&G&H&I&J&K&L&M&N&O&P&Q&R&S&T&U&V&W&X&Y&Z \\
+    \hline
+    D&E&F&G&H&I&J&K&L&M&N&O&P&Q&R&S&T&U&V&W&X&Y&Z&A&B&C
+
+\end{tabular}%
+}
+\caption{A simple substitution cipher demonstrated by a 3-letter shift.}
+\label{tab:caesar}
+\end{table}
+
+This simple encryption is, however, easy to break for several reasons.
+Caesar ciphers in general only offer 26 distinct keys, as larger shifts simply wrap around ($29 \bmod 26 = 3$) and a shift of 26
+leaves the cleartext unchanged. \newline
+Further, by shifting every letter by the same amount,
+properties of the source language such as word spacing and letter frequencies are retained in the ciphertext,
+leaving it vulnerable to simple attacks such as frequency analysis.
+
+
+\paragraph{Transposition} is the process of reordering the plaintext to obtain a ciphertext.
+Here, the key can be understood as instructions on how to re-order the ciphertext to obtain the original message.
+The \textit{scytale} is one of the earliest implementations of a transposition cipher.
+
+\paragraph{Confusion and Diffusion} \cite{enwiki:confusion-diffusion}
+
+\section{DES}\label{sec:des}
+The \acrfull{DES} is a symmetric (or private-key) cipher developed in the 1970s at IBM as an archetypal block cipher.
+It takes in a block of 64 bits and transforms it into a ciphertext block using a 56-bit key
+(usually written as 64 bits, 8 of which are parity bits).
+Despite suspicions of backdoors engineered into the algorithm due to the involvement of the NSA in the development of \acrshort{DES},
+it was approved as a federal standard in the USA in 1976 and only retired due to its short key length,
+for which the NSA, however, was directly responsible as well. \newline
+Nevertheless, it sparked public and scientific interest in the research of encryption algorithms, producing a large body of publications.
+
+\section{AES}
+The \acrfull{AES} superseded \acrshort{DES} in 2001 after an official selection process.
+Unlike its predecessor, it does not use a Feistel network.
+
+\section{RSA}
+\acrfull{RSA} is one of the first asymmetric (or public-key) cryptographic algorithms and can be used both for encryption and for digital signing.
+It is named after its inventors and was published in \citeyear{rsa}, after initial attempts to disprove the existence of \textit{trapdoor functions},
+a concept introduced by \citeauthor{diffiehellman} in their aptly named pivotal paper \citetitle{diffiehellman}.
+
+
+The algorithm they came up with relies on modular arithmetic, which remains the most popular basis for asymmetric cryptography;
+the key-generation steps are listed below.
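+
+Before going through the steps, a toy numerical sketch (deliberately tiny, insecure parameters chosen only for readability;
+all values are illustrative, and $e=17$ is used instead of the usual $65537$ so the numbers stay small) previews the
+modular arithmetic behind the key-generation procedure listed below:
+\begin{verbatim}
+from math import gcd
+
+p, q = 61, 53                  # toy primes, far too small for real use
+N = p * q                      # 3233, the public modulus
+phi = (p - 1) * (q - 1)        # 3120, Euler's totient of N (kept secret)
+e = 17                         # public exponent, must be coprime to phi
+assert gcd(e, phi) == 1
+d = pow(e, -1, phi)            # private exponent: inverse of e mod phi (Python 3.8+)
+
+m = 65                         # a message, encoded as an integer smaller than N
+c = pow(m, e, N)               # encryption:  c = m^e mod N
+assert pow(c, d, N) == m       # decryption recovers m: (m^e)^d = m mod N
+\end{verbatim}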
+
+\begin{enumerate}
+    \item Choose random and stochastically independent primes $p,q$ of similar size so that
+    $0.1 < | \log_2 p - \log_2 q | < 30 $.
+    \item Calculate $ N = p \cdot q $.
+    \item Compute Euler's totient function $ \varphi (N) = (p-1) \cdot (q-1)$, which is kept secret.
+    \item Choose an integer $e$ so that $ 1 < e < \varphi (N) $ and $\gcd(e, \varphi(N)) = 1$, i.e. $e$ and $\varphi(N)$
    are coprime. The most common choice here is $ e = 2^{16} + 1 = 65537 $, as $e$ is released as part of the public key.
+    \item For the private key, % TODO
+\end{enumerate}
+\clearpage
+%\printglossary[type=\acronymtype]
+%\printglossary
+\printbibliography
+\end{document}
diff --git a/entropy.bib b/entropy.bib
new file mode 100644
index 0000000..b0dd565
--- /dev/null
+++ b/entropy.bib
@@ -0,0 +1,35 @@
+ @misc{ enwiki:shannon-hartley,
+    author = "{Wikipedia contributors}",
+    title = "Shannon–Hartley theorem --- {Wikipedia}{,} The Free Encyclopedia",
+    year = "2025",
+    url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633",
+    note = "[Online; accessed 29-October-2025]"
+  }
+ @misc{ enwiki:noisy-channel,
+    author = "{Wikipedia contributors}",
+    title = "Noisy-channel coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
+    year = "2025",
+    url = "https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870",
+    note = "[Online; accessed 29-October-2025]"
+  }
+ @misc{ enwiki:source-coding,
+    author = "{Wikipedia contributors}",
+    title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
+    year = "2025",
+    url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
+    note = "[Online; accessed 29-October-2025]"
+  }
+ @misc{ dewiki:nyquist-shannon,
+    author = "Wikipedia",
+    title = "Nyquist-Shannon-Abtasttheorem --- Wikipedia{,} die freie Enzyklopädie",
+    year = "2025",
+    url = "https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066",
+    note = "[Online; Stand 29. Oktober 2025]"
+  }
+ @misc{ enwiki:information-content,
+    author = "{Wikipedia contributors}",
+    title = "Information content --- {Wikipedia}{,} The Free Encyclopedia",
+    year = "2025",
+    url = "https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600",
+    note = "[Online; accessed 29-October-2025]"
+  }
diff --git a/entropy.tex b/entropy.tex
new file mode 100644
index 0000000..ec3482a
--- /dev/null
+++ b/entropy.tex
@@ -0,0 +1,368 @@
+\documentclass{article}
+\usepackage[utf8x]{inputenc}
+\usepackage[margin=1in]{geometry} % Adjust margins
+\usepackage{caption}
+\usepackage{wrapfig}
+\usepackage{subcaption}
+\usepackage{parskip} % dont indent after paragraphs, figures
+\usepackage{xcolor}
+%\usepackage{csquotes} % Recommended for biblatex
+\usepackage{tikz}
+\usepackage{pgfplots}
+\usetikzlibrary{positioning}
+\usepackage{float}
+\usepackage{amsmath}
+\PassOptionsToPackage{hyphens}{url}
+\usepackage{hyperref} % allows urls to follow line breaks of text
+\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
+\addbibresource{entropy.bib}
+
+
+
+
+\title{Entropy as a measure of information}
+\date{\today}
+
+\begin{document}
+\maketitle
+\section{Introduction}
+Across disciplines, entropy is a measure of uncertainty or randomness.
+Originating in classical thermodynamics,
+over time it has been applied in different sciences such as chemistry and information theory.
+%As the informal concept of entropy gains popularity, its specific meaning can feel far-fetched and ambiguous.
+The name 'entropy' was first coined by the German physicist \textit{Rudolf Clausius} in 1865
+while postulating the second law of thermodynamics, one of the (three or four, depending on how they are counted)
+laws of thermodynamics based on universal observations regarding heat and energy conversion.
+
+Specifically, the second law states that not all thermal energy can be converted into work in a cyclic process.
+Or, in other words, that the entropy of an isolated system cannot decrease,
+as isolated systems always tend toward a state of thermodynamic equilibrium where entropy is highest for a given internal energy.
+Another result of this observation is the irreversibility of natural processes, also referred to as the \textit{arrow of time}.
+Even though the first law (conservation of energy) allows for a cup falling off a table and breaking
+as well as the reverse process of reassembling itself and jumping back onto the table,
+the second law only allows the former and denies the latter,
+requiring the state with higher entropy to occur later in time.
+
+Only 10 years later, in 1875, \textit{Ludwig Boltzmann} and \textit{Willard Gibbs} derived the formal definition
+that is still in use in information theory today.
+\begin{equation}
+S = -k_B \sum_i p_i \ln(p_i)
+\end{equation}
+It gives statistical meaning to the macroscopic phenomenon of classical thermodynamics
+by defining the entropy $S$ of a macrostate as
+a function of the probabilities $p_i$ of all its constituting microstates.
+$k_B$ refers to the Boltzmann constant, which Boltzmann himself did not determine numerically but which is part of today's SI system.
+
+\section{Shannon's axioms}
+\textit{Claude Shannon} adapted the concept of entropy to information theory.
+In an era of advancing communication technologies, the question he addressed was of increasing importance:
+How can messages be encoded and transmitted efficiently?
+He proposed the following axioms that a measure of information $I(p)$ would have to comply with:
+\begin{enumerate}
+    \item $I(1) = 0$: events that always occur do not communicate information.
+    \item $I(p)$ is monotonically decreasing in $p$, i.e. $I'(p) \leq 0$: an increase in the probability of an event
+    decreases the information gained from observing it, and vice versa.
+    \item $I(p_1 \cdot p_2) = I(p_1) + I(p_2)$: the information learned from independent events
+    is the sum of the information learned from each event.
+    \item $I(p)$ is a twice continuously differentiable function of $p$.
+\end{enumerate}
+
+As a measure, Shannon's formula uses the \textit{bit}, quantifying the efficiency of codes
+and media for transmission and storage.
+In information theory, entropy can be understood as the expected information of a message.
+\begin{equation}
+    H = E(I) = - \sum_i p_i \log_2(p_i)
+    \label{eq:entropy-information}
+\end{equation}
+This leaves $ I(p_i) = \log_2(1/p_i) = - \log_2(p_i)$, implying that an unexpected message (low probability) carries
+more information than one with higher probability.
+Intuitively, we can imagine David A. Johnston, a volcanologist reporting day after day that there is no
+activity on Mount St. Helens. After a while, we grow to expect this message because it is statistically very likely
+that tomorrow's message will be the same. When one day we get the message 'Vancouver! This is it!' it carries a lot of information,
+not only semantically (because it announces the eruption of a volcano) but statistically, because it was very unlikely
+given the transmission history.
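+
+These intuitions can be made concrete with (assumed, purely illustrative) numbers for the daily report,
+say a 99\% chance of ``no activity'' and a 1\% chance of an eruption warning:
+\begin{verbatim}
+import math
+
+def information(p: float) -> float:
+    """Surprisal I(p) = -log2(p) in bits."""
+    return -math.log2(p)
+
+p_quiet, p_eruption = 0.99, 0.01   # assumed probabilities of the two daily messages
+
+print(information(p_quiet))        # ~0.014 bits: the expected routine report
+print(information(p_eruption))     # ~6.6 bits: the rare eruption warning
+
+entropy = -sum(p * math.log2(p) for p in (p_quiet, p_eruption))
+print(entropy)                     # ~0.08 bits expected per daily message
+\end{verbatim}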
+
+However, uncertainty (entropy) in this situation would be relatively low.
+Because we attach high surprise only to the unlikely message of an eruption, the significantly more likely message
+carries less information: we already expected it before it arrived.
+
+Putting the axioms and our intuitive understanding of information and uncertainty together,
+we can see in \autoref{fig:graph-information} the logarithmic decay of the information carried by a message as its probability increases,
+as well as, in \autoref{fig:graph-entropy}, the entropy of a two-event source obtained by evaluating \autoref{eq:entropy-information} for two events, resulting in
+$-p \cdot \log_2(p) - (1-p) \cdot \log_2(1-p)$.
+
+\begin{figure}[H]
+\begin{minipage}{.5\textwidth}
+\begin{tikzpicture}
+    \begin{axis}[
+        domain=0:1,
+        samples=100,
+        axis lines=middle,
+        xlabel={$p$},
+        ylabel={Information [bits]},
+        xmin=0, xmax=1,
+        ymin=0, ymax=6.1,
+        grid=both,
+        width=8cm,
+        height=6cm,
+        every axis x label/.style={at={(current axis.right of origin)}, anchor=west},
+        every axis y label/.style={at={(current axis.above origin)}, anchor=south},
+    ]
+    \addplot[thick, blue] {-log2(x)};
+    \end{axis}
+\end{tikzpicture}
+\caption{Information contained in a message depending on its probability $p$}
+\label{fig:graph-information}
+\end{minipage}
+\begin{minipage}{.5\textwidth}
+\begin{tikzpicture}
+    \begin{axis}[
+        domain=0:1,
+        samples=100,
+        axis lines=middle,
+        xlabel={$p$},
+        ylabel={Entropy [bits]},
+        xmin=0, xmax=1,
+        ymin=0, ymax=1.1,
+        xtick={0,0.25,0.5,0.75,1},
+        grid=both,
+        width=8cm,
+        height=6cm,
+        every axis x label/.style={at={(current axis.right of origin)}, anchor=west},
+        every axis y label/.style={at={(current axis.above origin)}, anchor=south},
+    ]
+    \addplot[thick, blue] {-x * log2(x) - (1-x) * log2(1-x)};
+    \end{axis}
+\end{tikzpicture}
+\caption{Entropy of an event source with two possible events, depending on their probabilities $(p, 1-p)$}
+\label{fig:graph-entropy}
+\end{minipage}
+\end{figure}
+
+The base 2 is chosen for the logarithm as our computers rely on a system of the same base, but in principle
+arbitrary bases can be used, as logarithms to different bases are proportional according to $\log_a b = \frac{\log_c b}{\log_c a} $.
+
+Further, the $\log_2$ can be intuitively understood for an event source with $2^n$ possible outcomes:
+using standard binary coding, we can easily see that a message has to contain $\log_2(2^n) = n$ bits
+in order to be able to encode all possible outcomes.
+For alphabet sizes $a$ that are not powers of two, such as $a=10$, we can group $k$ event sources together:
+their $a^k$ combined outcomes can be encoded with $\lceil \log_2(a^k) \rceil$ bits,
+leaving the required bits per event source at $\lceil \log_2(a^k) \rceil / k \rightarrow \log_2(a)$ for large $k$.
+
+%- conditional entropy
+%- redundancy
+%- source entropy
+\section{Applications}
+\subsection{Decision Trees}
+A decision tree is a supervised learning approach commonly used in machine learning.
+The goal is to create an algorithm, i.e. a series of questions to pose to new data (input variables)
+in order to predict the target variable, a class label.
+Graphically, each question can be visualized as a node in a tree, splitting the dataset into two or more groups.
+This process is applied to the source set and then its resulting sets in a process called \textit{recursive partitioning}.
+Once a leaf is reached, the class of the input has been successfully determined.
+
+In order to build the shallowest possible trees, we want to use input variables that minimize uncertainty.
+While other measures for choosing the best split, such as the \textit{Gini coefficient}, exist,
+entropy is a popular measure used in decision trees.
+
+Using what we learned about entropy, we want the maximum decrease in entropy of our target variable,
+as explained in~\autoref{ex:decisiontree}.
+\begin{figure}[H]
+\centering
+\begin{minipage}{.3\textwidth}
+\begin{tabular}{c|c|c}
+     & hot & cold \\
+    \hline
+    rain & 4 & 5 \\
+    \hline
+    no rain & 3 & 2 \\
+\end{tabular}
+\end{minipage}
+\begin{minipage}{.6\textwidth}
+When choosing rain as the target variable, the entropy prior to partitioning is $H_{prior} = H(\frac{9}{14},\frac{5}{14})$;
+after partitioning by temperature (hot/cold), the entropies $H_{hot} = H(\frac{4}{7}, \frac{3}{7})$
+and $H_{cold} = H(\frac{5}{7}, \frac{2}{7})$ remain.
+This leaves us with an expected entropy of
+$p_{hot} \cdot H_{hot} + p_{cold} \cdot H_{cold}$ with $p_{hot} = p_{cold} = \frac{7}{14}$.
+The \textbf{information gain} can then be calculated as the difference of the entropy prior to and after partitioning.
+Since $H_{prior}$ is constant in this equation, it is sufficient to minimize the post-partitioning $E[H]$.
+\end{minipage}
+\caption{Example of information gain in decision trees}
+\label{ex:decisiontree}
+\end{figure}
+
+Advantages of decision trees over other machine learning approaches include low computation cost and
+interpretability, making them a popular choice for many applications.
+However, drawbacks include overfitting and poor robustness, where minimal alterations to the training data
+can lead to a change in tree structure.
+
+\subsection{Cross-Entropy}
+When dealing with two distributions, the \textit{cross-entropy} between a true distribution $p$
+and an estimated distribution $q$ is defined as:
+\begin{equation}
+    H(p, q) = -\sum_x p(x) \log_2 q(x)
+\end{equation}
+The closely related \textit{Kullback–Leibler divergence} measures how much information is lost when $q$
+is used to approximate $p$:
+\begin{equation}
+    D_{KL}(p \| q) = H(p, q) - H(p)
+\end{equation}
+In machine learning, this term appears in many loss/cost functions, notably in classification problems
+(cross-entropy loss) and in probabilistic models such as Variational Autoencoders (VAEs).
+There, the true label distribution and the predicted distribution take the roles of $p$ and $q$, respectively.
+For a single supervised training example, the cross-entropy loss reduces to $-\log(q_i)$, the negative log-probability
+assigned to the correct class $i$, as the true label vector is the one-hot unit vector $e_i$.
+
+\subsection{Coding}
+The concept of entropy also plays a crucial role in the design and evaluation of codes used for data compression and transmission.
+In this context, \textit{coding} refers to the representation of symbols or messages
+from a source using a finite set of codewords.
+Each codeword is typically composed of a sequence of bits,
+and the design goal is to minimize the average length of these codewords while maintaining unique decodability.
+
+According to Shannon's source coding theorem, the theoretical lower bound for the average codeword length of a source
+is given by its entropy $H$.
+In other words, no lossless coding scheme can achieve an average length smaller than the source entropy when expressed in bits.
+Codes that approach this bound are called \textit{efficient} or \textit{entropy-optimal}.
+A familiar example of such a scheme is \textit{Huffman coding},
+which assigns shorter codewords to more probable symbols and longer ones to less probable symbols,
+resulting in a prefix-free code with minimal expected length.
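+
+As a quick numerical check of this relationship, the following minimal Huffman sketch (illustrative only) confirms that for
+dyadic probabilities the average codeword length equals the entropy exactly:
+\begin{verbatim}
+import heapq
+from itertools import count
+
+def huffman_lengths(probs):
+    """Return Huffman codeword lengths for the given symbol probabilities."""
+    tie = count()                       # tiebreaker so equal probabilities never compare lists
+    heap = [(p, next(tie), [i]) for i, p in enumerate(probs)]
+    heapq.heapify(heap)
+    lengths = [0] * len(probs)
+    while len(heap) > 1:
+        p1, _, a = heapq.heappop(heap)  # the two least probable subtrees ...
+        p2, _, b = heapq.heappop(heap)
+        for s in a + b:                 # ... are merged, adding one bit to every symbol below them
+            lengths[s] += 1
+        heapq.heappush(heap, (p1 + p2, next(tie), a + b))
+    return lengths
+
+probs = [0.5, 0.25, 0.125, 0.125]       # dyadic probabilities, entropy H = 1.75 bits
+lengths = huffman_lengths(probs)        # -> [1, 2, 3, 3]
+print(sum(p * l for p, l in zip(probs, lengths)))   # average length 1.75 bits = H
+\end{verbatim}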
+
+Beyond compression, coding is essential for reliable communication over imperfect channels.
+In real-world systems, transmitted bits are often corrupted by noise, requiring mechanisms to detect and correct errors.
+One simple but powerful concept to quantify the robustness of a code is the \textit{Hamming distance}.
+The Hamming distance between two codewords is defined as the number of bit positions in which they differ.
+For example, the codewords $10110$ and $11100$ have a Hamming distance of 2.
+
+A code with a minimum Hamming distance $d_{min}$ can detect up to $d_{min}-1$ errors
+and correct up to $\lfloor (d_{min}-1)/2 \rfloor$ errors.
+This insight forms the basis of error-correcting codes such as Hamming codes,
+which add redundant bits to data in a structured way that enables the receiver to both identify and correct single-bit errors.
+
+Thus, the efficiency and reliability of communication systems are governed by a trade-off:
+higher redundancy (lower efficiency) provides greater error correction capability,
+while minimal redundancy maximizes data throughput but reduces error resilience.
+
+%Coding of a source of an information and communication channel
+% https://www.youtube.com/watch?v=ErfnhcEV1O8
+% relation to hamming distance and efficient codes
+
+\subsection{Noisy communication channels}
+The noisy-channel coding theorem was stated by \textit{Claude Shannon} in 1948, but the first rigorous proof was
+provided in 1954 by Amiel Feinstein.
+One of the important issues Shannon tackled with his 'Mathematical theory of communication'
+was the lack of means for transporting discrete data through a noisy channel more efficiently than
+the telegraph: in other words, how to communicate reliably over an unreliable channel.
+Until then, error correction had been limited to very basic techniques.
+
+\begin{figure}[H]
+\begin{tikzpicture}
+    \def\boxw{2.5cm}
+    \def\n{5}
+    \pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
+    % Draw the boxes
+    \node (A) at (0, 0) [draw, text width=\boxw, align=center] {Information Source};
+    \node (B) at (\boxw + \gap, 0) [draw, text width=\boxw, align=center] {Transmitter};
+    \node (C) at ({2*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Channel};
+    \node (N) at ({2*(\boxw + \gap)}, -1) [draw, text width=\boxw, align=center] {Noise};
+    \node (D) at ({3*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Receiver};
+    \node (E) at ({4*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Destination};
+
+    % Draw arrows between the boxes
+    \draw[->] (A) -- (B);
+    \draw[->] (B) -- (C);
+    \draw[->] (C) -- (D);
+    \draw[->] (D) -- (E);
+    \draw[->] (N) -- (C);
+\end{tikzpicture}
+\caption{Model of a noisy communication channel}
+\label{fig:noisy-channel}
+\end{figure}
+Early analogue connections like the first telephone lines bypassed the issue altogether and relied
+on the communicating parties and their brains' ability to filter human voices from the noise that was inevitably transmitted
+along with the intended signal.
+After some development, the telegraph in its final form used Morse code, a series of long and short signals that,
+together with letter and word gaps, encode text messages.
+Even though the long-short coding might appear similar to today's binary coding, the means of error correction were lacking.
+For a long time, it relied on simply repeating the message multiple times, which is highly inefficient.
The destination would then have to determine the most likely intended message by performing a majority vote.
+One might also propose simply increasing the transmission power, thereby decreasing the error rate of the associated channel.
+However, the noisy-channel coding theorem provides us with a more elegant solution.
+It is of foundational importance to information theory, stating that for a noisy channel with capacity $C$
+and information transmitted at a rate $R < C$, there exist codes that allow the probability of error at the receiver
+to be made arbitrarily small.
+A simple model of such a channel is the binary symmetric channel shown in \autoref{fig:binary-channel},
+which flips each transmitted bit independently with crossover probability $p$.
+
+\begin{figure}[H]
+\centering
+\begin{tikzpicture}
+    \node (S) at (0, 0) [draw, align=center] {Source};
+    \node (S0) at (3, 1) {$0$};
+    \node (S1) at (3, -1) {$1$};
+    \node (D0) at (7, 1) {$0$};
+    \node (D1) at (7, -1) {$1$};
+    \node (D) at (10, 0) [draw, align=center] {Destination};
+
+    \draw[->] (S) -- (S0);
+    \draw[->] (S) -- (S1);
+
+    \draw[->,dashed] (S0) -- (D0) node[midway, above] {$1-p$};
+    \draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$p$};
+    \draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$p$};
+    \draw[->,dashed] (S1) -- (D1) node[midway, below] {$1-p$};
+
+    \draw[->] (D0) -- (D);
+    \draw[->] (D1) -- (D);
+\end{tikzpicture}
+\caption{Binary symmetric channel with crossover probability $p$}
+\label{fig:binary-channel}
+\end{figure}
+
+The capacity of the binary symmetric channel is given by:
+\begin{equation}
+    C = 1 - H_2(p)
+\end{equation}
+where $H_2(p) = -p \log_2(p) - (1-p)\log_2(1-p)$ is the binary entropy function.
+As $p$ increases, uncertainty grows and channel capacity declines.
+When $p = 0.5$, output bits are completely random and no information can be transmitted ($C = 0$).
+As illustrated in \autoref{fig:graph-entropy}, an error rate $p > 0.5$ is equivalent to one of $1-p < 0.5$
+(the receiver can simply invert every bit), though this is not relevant in practice.
+
+Shannon's theorem is not constructive, as it does not provide an explicit method for constructing such efficient codes,
+but it guarantees their existence.
+In practice, structured codes such as Hamming and Reed–Solomon codes are employed to approach channel capacity.
+
+\section{Conclusion}
+Entropy provides a fundamental measure of uncertainty and information,
+bridging concepts from thermodynamics to modern communication theory.
+
+Beyond the provided examples, the concept of entropy has far-reaching applications in diverse fields:
+from cryptography, where it quantifies randomness and security,
+to statistical physics, where it characterizes disorder in complex systems,
+to biology, connecting molecular information and population diversity.
+
+\printbibliography
+
+\end{document}