Compare commits

...

15 Commits

Author SHA1 Message Date
eneller df511b4a3e update 2025-11-30 16:38:04 +01:00
eneller 597111974e update 2025-11-30 15:24:08 +01:00
eneller b67fa4db89 update 2025-11-28 12:57:59 +01:00
eneller e10e311f0f update 2025-11-26 23:27:47 +01:00
eneller e015e816bd update 2025-11-26 19:24:53 +01:00
eneller 25f725049e clean up latex header 2025-11-26 18:40:11 +01:00
eneller 520d3f8bc6 update 2025-11-26 18:25:10 +01:00
eneller 31a3e956ab begin compression 2025-11-25 13:14:17 +01:00
eneller 5f5ee2b395 minor updates 2025-11-03 11:06:19 +01:00
eneller f8013bcee5 update 2025-10-30 15:47:54 +01:00
eneller 358274979f update 2025-10-30 14:51:12 +01:00
eneller 3eb0f229fd bib 2025-10-30 13:20:34 +01:00
eneller 90aa504539 entropy sources 2025-10-29 22:31:42 +01:00
eneller d1b45ee1ec begin entropy examples 2025-10-29 17:21:03 +01:00
eneller b1989ee151 general entropy 2025-10-28 13:26:00 +01:00
8 changed files with 815 additions and 5 deletions

1
.gitignore vendored

@@ -332,3 +332,4 @@ TSWLatexianTemp*
# End of https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode
bib
*.*-SAVE-ERROR

5
.latexmkrc Normal file

@@ -0,0 +1,5 @@
$latex = 'latex %O --shell-escape %S';
$pdflatex = 'pdflatex %O --shell-escape %S';
$pdf_mode = 1;
$clean_ext = "lol nav snm loa bbl*";
$bibtex_use = 2;

94
compression.bib Normal file

@@ -0,0 +1,94 @@
@article{shannon1948mathematical,
title={A mathematical theory of communication},
author={Shannon, Claude E},
journal={The Bell system technical journal},
volume={27},
number={3},
pages={379--423},
year={1948},
publisher={Nokia Bell Labs}
}
@misc{ enwiki:shannon-source-coding,
author = "{Wikipedia contributors}",
title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
note = "[Online; accessed 25-November-2025]"
}
@misc{ enwiki:shannon-fano,
author = "{Wikipedia contributors}",
title = "ShannonFano coding --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Fano_coding&oldid=1315776380",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:huffman-code,
author = "{Wikipedia contributors}",
title = "Huffman coding --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Huffman_coding&oldid=1321991625",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:lzw,
author = "{Wikipedia contributors}",
title = "LempelZivWelch --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Lempel%E2%80%93Ziv%E2%80%93Welch&oldid=1307959679",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:arithmetic-code,
author = "{Wikipedia contributors}",
title = "Arithmetic coding --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Arithmetic_coding&oldid=1320999535",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:kraft-mcmillan,
author = "{Wikipedia contributors}",
title = "KraftMcMillan inequality --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Kraft%E2%80%93McMillan_inequality&oldid=1313803157",
note = "[Online; accessed 26-November-2025]"
}
@misc{ enwiki:partition,
author = "{Wikipedia contributors}",
title = "Partition problem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Partition_problem&oldid=1320732818",
note = "[Online; accessed 30-November-2025]"
}
@misc{ dewiki:shannon-fano,
author = "Wikipedia",
title = "Shannon-Fano-Kodierung --- Wikipedia{,} die freie Enzyklopädie",
year = "2024",
url = "https://de.wikipedia.org/w/index.php?title=Shannon-Fano-Kodierung&oldid=246624798",
note = "[Online; Stand 26. November 2025]"
}
@misc{ dewiki:huffman-code,
author = "Wikipedia",
title = "Huffman-Kodierung --- Wikipedia{,} die freie Enzyklopädie",
year = "2025",
url = "https://de.wikipedia.org/w/index.php?title=Huffman-Kodierung&oldid=254369306",
note = "[Online; Stand 26. November 2025]"
}
@misc{ dewiki:lzw,
author = "Wikipedia",
title = "Lempel-Ziv-Welch-Algorithmus --- Wikipedia{,} die freie Enzyklopädie",
year = "2025",
url = "https://de.wikipedia.org/w/index.php?title=Lempel-Ziv-Welch-Algorithmus&oldid=251943809",
note = "[Online; Stand 26. November 2025]"
}
@misc{ dewiki:kraft-mcmillan,
author = "Wikipedia",
title = "Kraft-Ungleichung --- Wikipedia{,} die freie Enzyklopädie",
year = "2018",
url = "https://de.wikipedia.org/w/index.php?title=Kraft-Ungleichung&oldid=172862410",
note = "[Online; Stand 26. November 2025]"
}
@misc{ dewiki:partition,
author = "Wikipedia",
title = "Partitionsproblem --- Wikipedia{,} die freie Enzyklopädie",
year = "2025",
url = "https://de.wikipedia.org/w/index.php?title=Partitionsproblem&oldid=255787013",
note = "[Online; Stand 26. November 2025]"
}

333
compression.tex Normal file

@@ -0,0 +1,333 @@
\documentclass{article}
%%% basic layouting
\usepackage[utf8x]{inputenc}
\usepackage[margin=1in]{geometry} % Adjust margins
\usepackage{caption}
\usepackage{hyperref}
\PassOptionsToPackage{hyphens}{url} % allow breaking urls
\usepackage{float}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage{parskip} % dont indent after paragraphs, figures
\usepackage{xcolor}
%%% algorithms
\usepackage{algorithm}
\usepackage{algpseudocodex}
% graphs and plots
\usepackage{tikz}
\usepackage{pgfplots}
\usetikzlibrary{positioning}
\usetikzlibrary{trees}
%\usetikzlibrary{graphs, graphdrawing}
%%% math
\usepackage{amsmath}
%%% citations
\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
%\usepackage{csquotes} % Recommended for biblatex
\addbibresource{compression.bib}
\title{Compression}
\author{Erik Neller}
\date{\today}
\begin{document}
\maketitle
\section{Introduction}
As the volume of data grows exponentially around the world, compression keeps gaining in importance across all disciplines.
Not only does it enable the storage of large amounts of information needed for research in scientific domains
like DNA sequencing and analysis, it also plays a vital role in keeping stored data accessible by
facilitating cataloging, search and retrieval.
The concept of entropy introduced in the previous entry is closely related to the design of efficient codes for compression.
In coding theory, the events of an information source are to be encoded in a manner that minimizes the bits needed to store
the information provided by the source.
The process of encoding can thus be described by a function $C$ mapping a source alphabet $X$ to a code alphabet $Y$.
Symbols in the two alphabets are denoted $x_i$ and $y_j$ respectively, and the source symbols occur with underlying probabilities $p_i$.
% TODO fix use of alphabet / symbol / code word: alphabet is usually binary -> code word is 010101
\begin{equation}
C: X \rightarrow Y \qquad X=\{x_1,x_2,...x_n\} \qquad Y=\{y_1,y_2,...y_m\}
\label{eq:formal-code}
\end{equation}
The understanding of entropy as the expected information $E(I)$ of a message provides an intuition that,
given a source with a certain entropy (in bits), no code can have an average word length $E(L)$ (in bits)
lower than this entropy without losing information.
\begin{equation}
H = E(I) = - \sum_i p_i \log_2(p_i) \quad \leq \quad E(L) = \sum_j p_j l_j
\label{eq:entropy-information}
\end{equation}
This is the content of Shannon's source coding theorem,
introduced in \citeyear{shannon1948mathematical}.
In his paper, \citeauthor{shannon1948mathematical} proposed two principal ideas to minimize the average length of a code.
The first is to use short codes for symbols with higher probability.
This is an intuitive approach as more frequent symbols have a higher impact on average code length.
The second idea is to jointly encode events that frequently occur together, artificially increasing
the size of the code alphabet $Y$ to allow for greater flexibility in code design~\cite{enwiki:shannon-source-coding}.
Codes can have several properties. A code in which all code words have equal length is called a \textit{block code}.
While easy to construct, block codes are not well suited for our goal of minimizing the average word length
as specified in \autoref{eq:entropy-information}, because the source alphabet is generally not uniformly distributed,
i.e.\ in general $p_i \neq \frac{1}{n}$.
In order to send (or store, for that matter) multiple code words in succession, a code $Y$ has to be uniquely decodable.
When receiving the sequence 0010 encoded with the nonsingular code $Y_2$ from \autoref{tab:code-properties},
it is not clear to the recipient which source symbols make up the intended message.
For the specified sequence, there are a total of three possibilities to decode the received code:
$s_0 s_3 s_0$, $s_0 s_0 s_1$ or $s_2 s_1$ could all be the intended message, making the code useless.
\begin{table}[H]
\centering
\begin{tabular}{c l l l}
Source Code $X$ & Prefix Code $Y_0$ & Suffix Code $Y_1$ & Nonsingular Code $Y_2$ \\
\hline
$s_0$ & 0 & 0 & 0 \\
$s_1$ & 10 & 01 & 10 \\
$s_2$ & 110 & 011 & 00 \\
$s_3$ & 1110 & 0111 & 01 \\
\end{tabular}
\caption{Examples of different properties of codes}
\label{tab:code-properties}
\end{table}
Another interesting property of a code, one that is especially important for transmission but less so for storage, is
being prefix-free.
A prefix code (also said to be prefix-free) can be decoded by the receiver as soon as each code word has been received,
because no code word $y_j$ is the prefix of another valid code word.
As shown in \autoref{tab:code-properties}, $Y_0$ is a prefix code, in this case more specifically called a \textit{comma code}
because each code word is separated from the next by a trailing 0.
$Y_1$, in contrast, is called a \textit{capital code} (it marks the beginning of each word with a leading 0) and is not a prefix code.
In fact, for the capital code every word other than the longest possible code word is a prefix of the longer words
lower in the table. As a result, the receiver cannot instantaneously decode each word but has to wait for the leading 0
of the next code word.
Further, a code is said to be \textit{efficient} if it has the smallest possible average word length, i.e.\ if it matches
the entropy of the source alphabet.
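As a brief illustration (the probabilities here are assumed purely for the sake of the example), consider the symbols $s_0, \ldots, s_3$ from \autoref{tab:code-properties} with probabilities $(\frac{1}{2}, \frac{1}{4}, \frac{1}{8}, \frac{1}{8})$:
\begin{equation*}
H = \tfrac{1}{2} \cdot 1 + \tfrac{1}{4} \cdot 2 + 2 \cdot \tfrac{1}{8} \cdot 3 = 1.75 \text{ bits}
\qquad
E(L_{Y_0}) = \tfrac{1}{2} \cdot 1 + \tfrac{1}{4} \cdot 2 + \tfrac{1}{8} \cdot 3 + \tfrac{1}{8} \cdot 4 = 1.875 \text{ bits}
\end{equation*}
The comma code $Y_0$ thus stays above the entropy bound; a prefix code with word lengths $(1, 2, 3, 3)$, e.g.\ $0, 10, 110, 111$, would reach it exactly.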
\section{Kraft-McMillan inequality}
The Kraft-McMillan inequality gives a necessary and sufficient condition on the word lengths for the existence of a prefix code.
In the form shown in \autoref{eq:kraft-mcmillan} it is intuitive to understand given a code tree.
Because a prefix code requires code words to be situated only at the leaves of the code tree,
every code word $i$ of length $l_i$ over a code alphabet of size $r$ uses up exactly a fraction $r^{-l_i}$ of the available leaves.
The sum over all code words can thus never be larger than one, otherwise
the code cannot be uniquely decodable \cite{enwiki:kraft-mcmillan}.
\begin{equation}
\sum_i r^{-l_i} \leq 1
\label{eq:kraft-mcmillan}
\end{equation}
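For the prefix code $Y_0$ from \autoref{tab:code-properties}, with word lengths $1, 2, 3, 4$ over a binary code alphabet ($r = 2$), the sum evaluates to $2^{-1} + 2^{-2} + 2^{-3} + 2^{-4} = \frac{15}{16} \leq 1$; the unused remainder of $\frac{1}{16}$ corresponds to the single unassigned leaf 1111 of the code tree.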
\section{Shannon-Fano}
Shannon-Fano coding is one of the earliest methods for constructing prefix codes.
It is a top-down method that divides the symbols into two groups of approximately equal total probability,
recursively partitioning them so that more frequent symbols end up with shorter code words.
\begin{algorithm}
\caption{Shannon-Fano compression}
\label{alg:shannon-fano}
\begin{algorithmic}
\Procedure{ShannonFano}{symbols, probabilities}
\State Sort symbols (and their probabilities) by decreasing probability
\If{length(symbols) $= 1$}
\State \Return empty codeword for the single symbol
\EndIf
\State $\text{half} \gets \frac{1}{2} \sum_i \text{probabilities}[i]$
\State $\text{current\_sum} \gets 0$
\State $\text{split\_index} \gets 1$
\For{$i \gets 1$ \textbf{to} length(symbols)}
\If{$|\text{current\_sum} + \text{probabilities}[i] - \text{half}| \leq |\text{current\_sum} - \text{half}|$}
\State $\text{current\_sum} \gets \text{current\_sum} + \text{probabilities}[i]$
\State $\text{split\_index} \gets i$
\Else
\State \textbf{break}
\EndIf
\EndFor
\State $\text{left\_group} \gets \text{symbols}[1 : \text{split\_index}]$
\State $\text{right\_group} \gets \text{symbols}[\text{split\_index} + 1 : \text{length(symbols)}]$
\State Assign prefix ``0'' to codes from ShannonFano($\text{left\_group}, \ldots$)
\State Assign prefix ``1'' to codes from ShannonFano($\text{right\_group}, \ldots$)
\EndProcedure
\end{algorithmic}
\end{algorithm}
While Shannon-Fano coding guarantees a prefix-free code with an average word length close to the entropy,
it is not guaranteed to be optimal. In practice, it often generates code words that are only slightly longer than necessary.
Weaknesses of the algorithm also include the non-trivial partitioning phase \cite{enwiki:partition}, which can however be solved relatively efficiently in practice.
Due to these limitations, neither of the two historically somewhat ambiguous Shannon-Fano algorithms is widely used today;
Huffman coding, described in the next section, is preferred instead.
\section{Huffman Coding}
\label{sec:huffman}
Huffman coding is an optimal prefix coding algorithm that minimizes the expected codeword length
for a given set of symbol probabilities. Developed by David Huffman in 1952, it guarantees optimality
by constructing a binary tree where the most frequent symbols are assigned the shortest codewords.
Huffman coding approaches the theoretical entropy limit for discrete memoryless sources,
making it one of the most important compression techniques in information theory.
Unlike Shannon-Fano, which uses a top-down approach, Huffman coding employs a bottom-up strategy.
The algorithm builds the code tree by iteratively combining the two symbols with the lowest probabilities
into a new internal node. This greedy approach ensures that the resulting tree minimizes the weighted path length,
where the weight of each symbol is its probability.
\begin{algorithm}
\caption{Huffman coding algorithm}
\label{alg:huffman}
\begin{algorithmic}
\Procedure{Huffman}{symbols, probabilities}
\State Create a leaf node for each symbol and add it to a priority queue
\While{priority queue contains more than one node}
\State Extract two nodes with minimum frequency: $\text{left}$ and $\text{right}$
\State Create a new internal node with frequency $\text{freq(left)} + \text{freq(right)}$
\State Set $\text{left}$ as the left child and $\text{right}$ as the right child
\State Add the new internal node to the priority queue
\EndWhile
\State $\text{root} \gets$ remaining node in priority queue
\State Traverse tree and assign codewords: ``0'' for left edges, ``1'' for right edges
\State \Return codewords
\EndProcedure
\end{algorithmic}
\end{algorithm}
The optimality of Huffman coding can be proven by exchange arguments.
The key insight is that if two codewords have the maximum length in an optimal code, they must correspond to the two least frequent symbols.
Moreover, these two symbols can be combined into a single meta-symbol without affecting optimality,
which leads to a recursive structure that guarantees Huffman's method produces an optimal code.
The average codeword length $L_{\text{Huffman}}$ produced by Huffman coding satisfies the following bounds:
\begin{equation}
H(X) \leq L_{\text{Huffman}} < H(X) + 1
\label{eq:huffman-bounds}
\end{equation}
where $H(X)$ is the entropy of the source. This means Huffman coding is guaranteed to be within one bit
of the theoretical optimum. In practice, when symbol probabilities are powers of $\frac{1}{2}$,
Huffman coding achieves perfect compression and $L_{\text{Huffman}} = H(X)$.
The computational complexity of Huffman coding is $O(n \log n)$, where $n$ is the number of distinct symbols.
A priority queue implementation using a binary heap achieves this bound, making Huffman coding
efficient even for large alphabets. Its widespread use in compression formats such as DEFLATE, JPEG, and MP3
testifies to its practical importance.
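As an illustration of the bottom-up merging step, the following Python sketch computes only the code word lengths, using a binary heap as the priority queue; the actual bit assignment is omitted and the example probabilities are chosen purely for illustration.
\begin{verbatim}
import heapq

def huffman_lengths(probs):
    """Code word lengths for a dict {symbol: probability} (sketch)."""
    # one heap entry per symbol; the counter breaks ties between equal
    # probabilities without ever comparing the symbol lists
    heap = [(p, i, [s]) for i, (s, p) in enumerate(probs.items())]
    heapq.heapify(heap)
    lengths = {s: 0 for s in probs}
    counter = len(heap)
    while len(heap) > 1:
        p1, _, syms1 = heapq.heappop(heap)   # two least likely subtrees
        p2, _, syms2 = heapq.heappop(heap)
        for s in syms1 + syms2:
            lengths[s] += 1                  # one level deeper in the tree
        heapq.heappush(heap, (p1 + p2, counter, syms1 + syms2))
        counter += 1
    return lengths

print(huffman_lengths({'a': 0.5, 'b': 0.25, 'c': 0.125, 'd': 0.125}))
# -> {'a': 1, 'b': 2, 'c': 3, 'd': 3}
\end{verbatim}
For this dyadic example distribution the resulting lengths equal $-\log_2(p_i)$, so the lower bound of \autoref{eq:huffman-bounds} is met with equality.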
However, Huffman coding has limitations. First, it requires knowledge of the probability distribution
of symbols before encoding, necessitating a preprocessing pass or transmission of frequency tables.
Second, it assigns an integer number of bits to each symbol, which can be suboptimal
when symbol probabilities do not align well with powers of two.
Symbol-by-symbol coding imposes a constraint that is often unneeded since codes will usually be packed in long sequences,
leaving room for further optimization as provided by Arithmetic Coding.
\section{Arithmetic Coding}
Arithmetic coding is a modern compression technique that encodes an entire message as a single interval
within the range $[0, 1)$, as opposed to symbol-by-symbol coding used by Huffman.
By iteratively refining this interval based on the probabilities of the symbols in the message,
arithmetic coding can achieve compression rates that approach the entropy of the source.
Its ability to handle non-integer bit lengths makes it particularly powerful
for applications requiring high compression efficiency.
In the basic form, a message is first written in the base of the alphabet size with a leading `$0.$': $\text{ABBCAB} = 0.011201_3$,
in this case yielding a ternary number as the alphabet has size $|\{A,B,C\}| = 3$.
This number can then be converted to the target base (usually 2) with just enough precision to recover the original digits, resulting in $0.0010110011_2$.
The decoder only receives the rational number $q$ and the length $n$ of the original message.
The encoding can then easily be reversed by changing the base back and truncating to $n$ digits.
In general, arithmetic coding can produce near-optimal output for any given source probability distribution.
This is achieved by adjusting the intervals that are interpreted as a given source symbol.
Given the source probabilities $p_A = \frac{6}{8}, p_B = p_C = \frac{1}{8}$, the intervals would be adjusted to
$A = [0,\frac{6}{8}), B = [\frac{6}{8}, \frac{7}{8}), C = [\frac{7}{8},1)$.
Instead of transforming the base of the number and rounding to appropriate precision, the encoder recursively refines the interval and in the end chooses a number inside the final interval.
\begin{enumerate}
\item \textbf{Symbol A:} the interval is narrowed to $A = [0, \frac{6}{8})$.
\item Subdividing this interval yields $A = [0, (\frac{6}{8})^2), B = [(\frac{6}{8})^2, \frac{7}{8} \cdot \frac{6}{8}), C = [\frac{7}{8} \cdot \frac{6}{8}, \frac{6}{8})$.
\item \textbf{Symbol B:} the interval is narrowed to $B = [(\frac{6}{8})^2, \frac{7}{8} \cdot \frac{6}{8}) = [\frac{36}{64}, \frac{42}{64})$.
\end{enumerate}
Depending on the implementation, the source message can also be encoded with the alphabet enlarged by one symbol, reserving room for a special \verb|END-OF-DATA| symbol that the decoder
will look for and consequently stop reading from the input $q$.
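The interval refinement itself takes only a handful of operations per symbol. The following Python sketch reproduces the example above with exact fractions (the helper name and the use of the fractions module are illustrative choices, not part of any standard):
\begin{verbatim}
from fractions import Fraction

def arithmetic_interval(message, probs):
    """Refine [0, 1) according to the cumulative source probabilities."""
    cum, total = {}, Fraction(0)
    for symbol, p in probs.items():   # cumulative lower bound per symbol
        cum[symbol] = total
        total += p
    low, width = Fraction(0), Fraction(1)
    for symbol in message:            # narrow the interval symbol by symbol
        low += width * cum[symbol]
        width *= probs[symbol]
    return low, low + width           # any number in [low, low+width) works

probs = {'A': Fraction(6, 8), 'B': Fraction(1, 8), 'C': Fraction(1, 8)}
print(arithmetic_interval("AB", probs))
# -> (Fraction(9, 16), Fraction(21, 32)), i.e. [36/64, 42/64) as above
\end{verbatim}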
\section{LZW Algorithm}
The Lempel-Ziv-Welch (LZW) algorithm is a dictionary-based compression method that dynamically builds a dictionary
of recurring patterns in the data as compression proceeds. Unlike entropy-based methods such as Huffman or arithmetic coding,
LZW does not require prior knowledge of symbol probabilities, making it highly adaptable and efficient
for a wide range of applications, including image and text compression.
The algorithm was developed by Abraham Lempel and Jacob Ziv, with refinements by Terry Welch in 1984.
The fundamental insight of LZW is that many data sources contain repeating patterns that can be exploited
by replacing longer sequences with shorter codes. Rather than assigning variable-length codes to individual symbols
based on their frequency, LZW identifies recurring substrings and assigns them fixed-length codes.
As the algorithm processes the data, it dynamically constructs a dictionary that maps these patterns to codes,
without requiring the dictionary to be transmitted with the compressed data.
\begin{algorithm}
\caption{LZW compression algorithm}
\label{alg:lzw}
\begin{algorithmic}
\Procedure{LZWCompress}{data}
\State Initialize dictionary with all single characters
\State $\text{code} \gets$ next available code (typically 256 for byte alphabet)
\State $w \gets$ first symbol from data
\State $\text{output} \gets [\,]$
\For{each symbol $c$ in remaining data}
\If{$w + c$ exists in dictionary}
\State $w \gets w + c$
\Else
\State append $\text{code}(w)$ to output
\If{code $<$ max\_code}
\State Add $w + c$ to dictionary with code $\text{code}$
\State $\text{code} \gets \text{code} + 1$
\EndIf
\State $w \gets c$
\EndIf
\EndFor
\State append $\text{code}(w)$ to output
\State \Return output
\EndProcedure
\end{algorithmic}
\end{algorithm}
The decompression process is equally elegant. The decompressor initializes an identical dictionary
and reconstructs the original data by decoding the transmitted codes. Crucially, the decompressor
can reconstruct the dictionary entries on-the-fly as it processes the compressed data,
recovering the exact sequence of dictionary updates that occurred during compression.
This property is what allows the dictionary to remain implicit rather than explicitly transmitted.
\begin{algorithm}
\caption{LZW decompression algorithm}
\label{alg:lzw-decompress}
\begin{algorithmic}
\Procedure{LZWDecompress}{codes}
\State Initialize dictionary with all single characters
\State $\text{code} \gets$ next available code
\State $w \gets \text{decode}(\text{codes}[0])$
\State $\text{output} \gets w$
\For{each code $c$ in $\text{codes}[1:]$}
\If{$c$ exists in dictionary}
\State $k \gets \text{decode}(c)$
\Else
\State $k \gets w + w[0]$ \quad \{handle special case\}
\EndIf
\State append $k$ to output
\State Add $w + k[0]$ to dictionary with code $\text{code}$
\State $\text{code} \gets \text{code} + 1$
\State $w \gets k$
\EndFor
\State \Return output
\EndProcedure
\end{algorithmic}
\end{algorithm}
LZW's advantages make it particularly valuable for certain applications. First, it requires no statistical modeling
of the input data, making it applicable to diverse data types without prior analysis.
Second, the dictionary is built incrementally and implicitly, eliminating transmission overhead.
Third, it can achieve significant compression on data with repeating patterns, such as text, images, and structured data.
Fourth, the algorithm is relatively simple to implement and computationally efficient, with time complexity $O(n)$
where $n$ is the length of the input.
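A minimal Python sketch of the compression side is shown below; it assumes a byte-sized initial alphabet and, unlike production implementations, never caps the dictionary size.
\begin{verbatim}
def lzw_compress(data: str):
    """Return the list of LZW codes for a string (sketch, no code-size cap)."""
    dictionary = {chr(i): i for i in range(256)}   # all single characters
    next_code = 256
    w, output = "", []
    for c in data:
        if w + c in dictionary:
            w += c                                 # keep growing the match
        else:
            output.append(dictionary[w])           # emit longest known prefix
            dictionary[w + c] = next_code          # learn the new pattern
            next_code += 1
            w = c
    if w:
        output.append(dictionary[w])
    return output

print(lzw_compress("ABABABABAB"))
# -> [65, 66, 256, 258, 257, 66]: six codes for ten input characters
\end{verbatim}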
However, LZW has notable limitations. Its compression effectiveness is highly dependent on the structure and repetitiveness
of the input data. On truly random data with no repeating patterns, LZW can even increase the file size.
Additionally, the fixed size of the dictionary (typically 12 or 16 bits, allowing $2^{12}=4096$ or $2^{16}=65536$ entries)
limits its ability to adapt to arbitrarily large vocabularies of patterns.
When the dictionary becomes full, most implementations stop adding new entries, potentially reducing compression efficiency.
LZW has seen widespread practical deployment in compression standards and applications.
The GIF image format uses LZW compression, as does the TIFF image format in some variants.
The relationship between dictionary-based methods like LZW and entropy-based methods like Huffman
is complementary rather than competitive. LZW excels at capturing structure and repetition,
while entropy-based methods optimize symbol encoding based on probability distributions.
This has led to hybrid approaches that combine both techniques, such as the Deflate algorithm,
which uses LZSS (a variant of LZ77) followed by Huffman coding of the output to achieve better compression ratios.
\printbibliography
\end{document}

2
correction.tex Normal file

@@ -0,0 +1,2 @@
Error correction codes (theorems: Hamming condition, Gilbert-Varshamov)

2
crypto.tex Normal file

@@ -0,0 +1,2 @@
Cryptography: DES, AES and RSA

35
entropy.bib Normal file

@@ -0,0 +1,35 @@
@misc{ enwiki:shannon-hartley,
author = "{Wikipedia contributors}",
title = "ShannonHartley theorem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633",
note = "[Online; accessed 29-October-2025]"
}
@misc{ enwiki:noisy-channel,
author = "{Wikipedia contributors}",
title = "Noisy-channel coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870",
note = "[Online; accessed 29-October-2025]"
}
@misc{ enwiki:source-coding,
author = "{Wikipedia contributors}",
title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
note = "[Online; accessed 29-October-2025]"
}
@misc{ dewiki:nyquist-shannon,
author = "Wikipedia",
title = "Nyquist-Shannon-Abtasttheorem --- Wikipedia{,} die freie Enzyklopädie",
year = "2025",
url = "https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066",
note = "[Online; Stand 29. Oktober 2025]"
}
@misc{ enwiki:information-content,
author = "{Wikipedia contributors}",
title = "Information content --- {Wikipedia}{,} The Free Encyclopedia",
year = "2025",
url = "https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600",
note = "[Online; accessed 29-October-2025]"
}

entropy.tex

@@ -2,13 +2,20 @@
\usepackage[utf8x]{inputenc}
\usepackage[margin=1in]{geometry} % Adjust margins
\usepackage{caption}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage{parskip} % dont indent after paragraphs, figures
\usepackage{xcolor}
%\usepackage{csquotes} % Recommended for biblatex
\usepackage{tikz}
\usepackage{pgfplots}
\usetikzlibrary{positioning}
\usepackage{float}
\usepackage{amsmath}
\PassOptionsToPackage{hyphens}{url}\usepackage{hyperref} % allows urls to follow line breaks of text
\PassOptionsToPackage{hyphens}{url}
\usepackage{hyperref} % allows urls to follow line breaks of text
\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
\addbibresource{entropy.bib}
@@ -19,13 +26,344 @@
\begin{document}
\maketitle
\section{What is entropy?}
\section{Definitions across disciplines}
\section{Introduction}
Across disciplines, entropy is a measure of uncertainty or randomness.
Originating in classical thermodynamics,
over time it has been applied in different sciences such as chemistry and information theory.
%As the informal concept of entropy gains popularity, its specific meaning can feel far-fetched and ambiguous.
The name `entropy' was first coined by the German physicist \textit{Rudolf Clausius} in 1865
while postulating the second law of thermodynamics, one of the three (or four, counting the zeroth)
laws of thermodynamics based on universal observations regarding heat and energy conversion.
Specifically, the second law states that not all thermal energy can be converted into work in a cyclic process.
Or, in other words, that the entropy of an isolated system cannot decrease,
as such systems always tend toward a state of thermodynamic equilibrium where entropy is highest for a given internal energy.
Another result of this observation is the irreversibility of natural processes, also referred to as the \textit{arrow of time}.
Even though the first law (conservation of energy) allows for a cup falling off a table and breaking
as well as the reverse process of reassembling itself and jumping back onto the table,
the second law only allows the former and denies the latter,
requiring the state with higher entropy to occur later in time.
Only 10 years later, in 1875, \textit{Ludwig Boltzmann} and \textit{Willard Gibbs} derived the formal definition
that is still in use in information theory today.
\begin{equation}
S = -k_B \sum_i p_i \ln(p_i)
\end{equation}
It gives statistical meaning to the macroscopic phenomenon of classical thermodynamics
by defining the entropy $S$ of a macrostate as
the result of probabilities $p_i$ of all its constituting micro states.
$k_B$ refers to the Boltzmann constant, which Boltzmann himself did not determine numerically but which is part of today's SI system.
\section{Shannon's axioms}
\textit{Claude Shannon} adapted the concept of entropy to information theory.
In an era of advancing communication technologies, the question he addressed was of increasing importance:
How can messages be encoded and transmitted efficiently?
He proposed the following axioms that a measure of information $I(p)$ would have to comply with:
\begin{enumerate}
\item $I(1) = 0$: events that always occur do not communicate information.
\item $I(p)$ is monotonically decreasing in $p$, i.e.\ $I'(p) \leq 0$: an increase in the probability of an event
decreases the information gained from observing it, and vice versa.
\item $I(p_1 \cdot p_2) = I(p_1) + I(p_2)$: the information learned from independent events
is the sum of the information learned from each event.
\item $I(p)$ is a twice continuously differentiable function of p.
\end{enumerate}
As a measure, Shannon's formula uses the \textit{Bit}, quantifying the efficiency of codes
and media for transmission and storage.
In information theory, entropy can be understood as the expected information of a message.
\begin{equation}
H = E(I) = - \sum_i p_i \log_2(p_i)
\label{eq:entropy-information}
\end{equation}
This leaves $I(p_i) = \log_2(1/p_i) = - \log_2(p_i)$, implying that an unexpected message (low probability) carries
more information than one with higher probability.
Intuitively, we can imagine David A. Johnston, a volcanologist, reporting day after day that there is no
activity on Mount St. Helens. After a while, we grow to expect this message because it is statistically very likely
that tomorrow's message will be the same. When some day we get the message `Vancouver! This is it!', it carries a lot of information,
not only semantically (because it announces the eruption of a volcano) but also statistically, because it was very unlikely
given the transmission history.
However, the uncertainty (entropy) of this source is relatively low:
because we attach high surprise only to the unlikely message of an eruption, the significantly more likely message
carries little information, and we already expected it before it arrived.
Putting the axioms and our intuitive understanding of information and uncertainty together,
we can see the logarithmic decay of the information transported by a message as its probability increases in \autoref{fig:graph-information},
as well as the entropy of a two-event source in \autoref{fig:graph-entropy}, obtained by evaluating \autoref{eq:entropy-information} for two events with probabilities $p$ and $1-p$:
$-p \cdot \log_2(p) - (1-p) \cdot \log_2(1-p)$.
\begin{figure}[H]
\begin{minipage}{.5\textwidth}
\begin{tikzpicture}
\begin{axis}[
domain=0:1,
samples=100,
axis lines=middle,
xlabel={$p$},
ylabel={Information [bits]},
xmin=0, xmax=1,
ymin=0, ymax=6.1,
grid=both,
width=8cm,
height=6cm,
every axis x label/.style={at={(current axis.right of origin)}, anchor=west},
every axis y label/.style={at={(current axis.above origin)}, anchor=south},
]
\addplot[thick, blue] {-log2(x)};
\end{axis}
\end{tikzpicture}
\caption{Information contained in a message depending on its probability $p$}
\label{fig:graph-information}
\end{minipage}
\begin{minipage}{.5\textwidth}
\begin{tikzpicture}
\begin{axis}[
domain=0:1,
samples=100,
axis lines=middle,
xlabel={$p$},
ylabel={Entropy [bits]},
xmin=0, xmax=1,
ymin=0, ymax=1.1,
xtick={0,0.25,0.5,0.75,1},
grid=both,
width=8cm,
height=6cm,
every axis x label/.style={at={(current axis.right of origin)}, anchor=west},
every axis y label/.style={at={(current axis.above origin)}, anchor=south},
]
\addplot[thick, blue] {-x * log2(x) - (1-x) * log2(1-x)};
\end{axis}
\end{tikzpicture}
\caption{Entropy of an event source with two possible events, depending on their probabilities $(p, 1-p)$}
\label{fig:graph-entropy}
\end{minipage}
\end{figure}
The base 2 is chosen for the logarithm because our computers rely on a system of the same base, but in principle
arbitrary bases can be used, as they only differ by a constant factor according to $\log_a b = \frac{\log_c b}{\log_c a}$.
Further, the $\log_2$ can be intuitively understood for an event source with $2^n$ equally likely outcomes:
using standard binary coding, we can easily see that a message has to contain $\log_2(2^n) = n$ bits
in order to be able to encode all possible outcomes.
For alphabet sizes $a \neq 2^n$, such as $a = 10$, there are integers $k$ and $n$ with $a^k \approx 2^n$,
which defines a message size of $n$ bits that can encode the outcomes of $k$ such event sources at once,
leaving the required bits per event source at roughly $\log_2(a^k) / k = \log_2(a)$.
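For concreteness, the entropy formula can be computed in a few lines; the following Python sketch (the helper name is chosen here for illustration) evaluates \autoref{eq:entropy-information} for a few distributions:
\begin{verbatim}
import math

def entropy(probabilities):
    """Shannon entropy in bits of a discrete probability distribution."""
    return -sum(p * math.log2(p) for p in probabilities if p > 0)

print(entropy([0.5, 0.5]))      # 1.0 bit: a fair coin
print(entropy([0.25] * 4))      # 2.0 bits: four equally likely events
print(entropy([0.99, 0.01]))    # ~0.08 bits: a very predictable source
\end{verbatim}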
%- conditional entropy
%- redundancy
%- source entropy
\section{Shannon Axioms}
\section{Coding of a source of an information and communication channel}
\section{Applications}
\subsection{Decision Trees}
A decision tree is a supervised learning approach commonly used in machine learning.
The goal is to create an algorithm, i.e.\ a series of questions to pose to new data (input variables)
in order to predict the target variable, a class label.
Graphically, each question can be visualized as a node in a tree, splitting the dataset into two or more groups.
This process is applied to the source set and then its resulting sets in a process called \textit{recursive partitioning}.
Once a leaf is reached, the class of the input has been successfully determined.
In order to build the shallowest possible trees, we want to split on the input variable that minimizes the remaining uncertainty.
While other measures for choosing the best split, such as the \textit{Gini impurity}, exist,
entropy is a popular measure used in decision trees.
Using what we learned about entropy, we want the maximum decrease in entropy of our target variable,
as explained in~\autoref{ex:decisiontree}.
\begin{figure}[H]
\centering
\begin{minipage}{.3\textwidth}
\begin{tabular}{c|c|c}
& hot & cold \\
\hline
rain &4 &5 \\
\hline
no rain & 3 & 2 \\
\end{tabular}
\end{minipage}
\begin{minipage}{.6\textwidth}
When choosing rain as the target variable, the entropy prior to partitioning is $H_{prior} = H(\frac{9}{14},\frac{5}{14})$;
after partitioning by temperature (hot/cold), $H_{hot} = H(\frac{4}{7}, \frac{3}{7})$
and $H_{cold} = H(\frac{5}{7}, \frac{2}{7})$ remain.
This leaves us with an expected entropy of
$p_{hot} \cdot H_{hot} + p_{cold} \cdot H_{cold}$.
The \textbf{information gain} can then be calculated as the difference between the entropy prior to and after partitioning.
Since $H_{prior}$ is constant in this equation, it is sufficient to minimize the post-partitioning expected entropy $E[H]$.
\end{minipage}
\caption{Example of information gain in decision trees}
\label{ex:decisiontree}
\end{figure}
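Numerically, the information gain for the split above can be computed with a small helper that builds on the entropy function sketched earlier (the function name and call signature are illustrative):
\begin{verbatim}
def information_gain(prior, groups):
    """Expected entropy reduction of a split, given class counts.

    prior  -- class counts before the split, e.g. [9, 5]
    groups -- class counts per branch,       e.g. [[4, 3], [5, 2]]
    """
    total = sum(prior)
    expected = sum(sum(g) / total * entropy([c / sum(g) for c in g])
                   for g in groups)
    return entropy([c / total for c in prior]) - expected

print(information_gain([9, 5], [[4, 3], [5, 2]]))  # ~0.02 bits
\end{verbatim}
A gain this small suggests that temperature is a poor splitting variable for predicting rain in this example.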
Advantages of decision trees over other machine learning approaches include low computational cost and
interpretability, making them a popular choice for many applications.
However, drawbacks include overfitting and poor robustness, where minimal alterations to the training data
can lead to large changes in the tree structure.
\subsection{Cross-Entropy}
When dealing with two distributions, the \textit{cross-entropy}
between a true distribution $p$
and an estimated distribution $q$ is defined as:
\begin{equation}
H(p, q) = -\sum_x p(x) \log_2 q(x)
\end{equation}
The \textit{Kullback-Leibler divergence} measures how much information is lost when $q$
is used to approximate $p$:
\begin{equation}
D_{KL}(p \| q) = H(p, q) - H(p)
\end{equation}
In machine learning, this term appears in many loss/cost functions, notably in classification problems
(cross-entropy loss) and in probabilistic models such as Variational Autoencoders (VAEs).
There, the true and predicted label distributions are used as the true and estimated distribution, respectively.
For a supervised training example, the cross-entropy loss degenerates to $-\log(q_i)$, the negative log of the predicted probability of the true class $i$, as the
true label vector is assumed to be the one-hot unit vector $e_i$.
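A minimal sketch of both quantities, reusing the entropy helper from above (the example distributions are made up for illustration):
\begin{verbatim}
def cross_entropy(p, q):
    """H(p, q) in bits; assumes q is nonzero wherever p is."""
    return -sum(pi * math.log2(qi) for pi, qi in zip(p, q) if pi > 0)

def kl_divergence(p, q):
    """D_KL(p || q) = H(p, q) - H(p)."""
    return cross_entropy(p, q) - entropy(p)

p = [1.0, 0.0, 0.0]          # one-hot true label
q = [0.7, 0.2, 0.1]          # predicted distribution
print(cross_entropy(p, q))   # -log2(0.7), roughly 0.51 bits
print(kl_divergence(p, q))   # identical here, since H(p) = 0
\end{verbatim}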
\subsection{Coding}
The concept of entropy also plays a crucial role in the design and evaluation of codes used for data compression and transmission.
In this context, \textit{coding} refers to the representation of symbols or messages
from a source using a finite set of codewords.
Each codeword is typically composed of a sequence of bits,
and the design goal is to minimize the average length of these codewords while maintaining unique decodability.
According to Shannon's source coding theorem, the theoretical lower bound for the average codeword length of a source
is given by its entropy $H$.
In other words, no lossless coding scheme can achieve an average length smaller than the source entropy when expressed in bits.
Codes that approach this bound are called \textit{efficient} or \textit{entropy-optimal}.
A familiar example of such a scheme is \textit{Huffman coding},
which assigns shorter codewords to more probable symbols and longer ones to less probable symbols,
resulting in a prefix-free code with minimal expected length.
Beyond compression, coding is essential for reliable communication over imperfect channels.
In real-world systems, transmitted bits are often corrupted by noise, requiring mechanisms to detect and correct errors.
One simple but powerful concept to quantify the robustness of a code is the \textit{Hamming distance}.
The Hamming distance between two codewords is defined as the number of bit positions in which they differ.
For example, the codewords $10110$ and $11100$ have a Hamming distance of 2.
A code with a minimum Hamming distance $d_{min}$ can detect up to $d_{min}-1$ errors
and correct up to $\lfloor (d_{min}-1)/2 \rfloor$ errors.
This insight forms the basis of error-correcting codes such as Hamming codes,
which add redundant bits to data in a structured way that enables the receiver to both identify and correct single-bit errors.
Thus, the efficiency and reliability of communication systems are governed by a trade-off:
higher redundancy (lower efficiency) provides greater error correction capability,
while minimal redundancy maximizes data throughput but reduces error resilience.
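The Hamming distance itself is a one-line computation; the sketch below reproduces the example above:
\begin{verbatim}
def hamming_distance(a: str, b: str) -> int:
    """Number of positions in which two equal-length codewords differ."""
    assert len(a) == len(b)
    return sum(x != y for x, y in zip(a, b))

print(hamming_distance("10110", "11100"))  # -> 2, as in the example above
\end{verbatim}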
%Coding of a source of an information and communication channel
% https://www.youtube.com/watch?v=ErfnhcEV1O8
% relation to hamming distance and efficient codes
\subsection{Noisy communication channels}
The noisy channel coding theorem was stated by \textit{Claude Shannon} in 1948, but the first rigorous proof was
provided in 1954 by Amiel Feinstein.
One of the important issues Shannon tackled with his `A Mathematical Theory of Communication'
was the lack of means for transporting discrete data through a noisy channel that were more efficient than
the telegraph, or, in other words, how to communicate reliably over an unreliable channel.
Error correction until then had been limited to very basic techniques.
\begin{figure}[H]
\begin{tikzpicture}
\def\boxw{2.5cm}
\def\n{5}
\pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
% Draw the boxes
\node (A) at (0, 0) [draw, text width=\boxw, align=center] {Information Source};
\node (B) at (\boxw + \gap, 0) [draw, text width=\boxw, align=center] {Transmitter};
\node (C) at ({2*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Channel};
\node (N) at ({2*(\boxw + \gap)}, -1) [draw, text width=\boxw, align=center] {Noise};
\node (D) at ({3*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Receiver};
\node (E) at ({4*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Destination};
% Draw arrows between the boxes
\draw[->] (A) -- (B);
\draw[->] (B) -- (C);
\draw[->] (C) -- (D);
\draw[->] (D) -- (E);
\draw[->] (N) -- (C);
\end{tikzpicture}
\caption{Model of a noisy communication channel}
\label{fig:noisy-channel}
\end{figure}
Early analogue connections such as the first telephone lines bypassed the issue altogether and relied
on the communicating parties and their brains' ability to filter human voices from the noise that was inevitably transmitted
along with the intended signal.
After some development, the telegraph in its final form used Morse code, a series of long and short clicks that,
together with letter and word gaps, would encode text messages.
Even though the long-short coding might appear similar to today's binary coding, the means of error correction were lacking.
For a long time, error correction relied on simply repeating the message multiple times, which is highly inefficient.
The destination would then have to determine the most likely intended message by performing a majority vote.
One might also propose simply increasing transmitting power, thereby decreasing the error rate of the associated channel.
However, the noisy channel coding theorem provides us with a more elegant solution.
It is of foundational importance to information theory, stating that given a noisy channel with capacity $C$,
information can be transmitted at any rate $R < C$ with suitable coding such that the error rate at the receiver becomes
arbitrarily small.
\paragraph{Channel capacity and mutual information}
For any discrete memoryless channel, we can describe its behavior with a conditional probability distribution
$p(y|x)$ — the probability that symbol $y$ is received given symbol $x$ was sent.
The \textit{mutual information} between the transmitted and received signals measures how much information, on average, passes through the channel:
\begin{equation}
I(X;Y) = \sum_{x,y} p(x, y) \log_2 \frac{p(x, y)}{p(x)p(y)} = H(Y) - H(Y|X)
\end{equation}
The \textit{channel capacity} $C$ is then defined as the maximum achievable mutual information across all possible input distributions:
\begin{equation}
C = \max_{p(x)} I(X;Y)
\end{equation}
It represents the highest rate (in bits per symbol) at which information can be transmitted with arbitrarily small error,
given optimal encoding and decoding schemes.
\paragraph{Binary symmetric channel (BSC)}
The binary symmetric channel is one example of such discrete memoryless channels, where each transmitted bit
has a probability $p$ of being flipped during transmission and a probability $(1-p)$ of being received correctly.
\begin{figure}[H]
\begin{tikzpicture}
\def\boxw{2.5cm}
\def\n{4}
\pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
\node (S) at (0,0) [draw, align=center, text width=\boxw] {Transmitter};
\node (S0) at (\boxw + \gap,1) [draw, circle] {0};
\node (S1) at (\boxw + \gap,-1) [draw, circle] {1};
\node (D0) at ({2*(\boxw + \gap)},1) [draw, circle] {0};
\node (D1) at ({2*(\boxw + \gap)},-1) [draw, circle] {1};
\node (D) at ({3*(\boxw + \gap)},0) [draw, align=center, text width=\boxw] {Receiver};
\draw[->] (S) -- (S0);
\draw[->] (S) -- (S1);
\draw[->,dashed] (S0) -- (D0) node[midway, above] {$1-p$};
\draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$p$};
\draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$p$};
\draw[->,dashed] (S1) -- (D1) node[midway, below] {$1-p$};
\draw[->] (D0) -- (D);
\draw[->] (D1) -- (D);
\end{tikzpicture}
\caption{Binary symmetric channel with crossover probability $p$}
\label{fig:binary-channel}
\end{figure}
The capacity of the binary symmetric channel is given by:
\begin{equation}
C = 1 - H_2(p)
\end{equation}
where $H_2(p) = -p \log_2(p) - (1-p)\log_2(1-p)$ is the binary entropy function.
As $p$ increases, uncertainty grows and channel capacity declines.
When $p = 0.5$, output bits are completely random and no information can be transmitted ($C = 0$).
As illustrated in \autoref{fig:graph-entropy}, a crossover probability $p > 0.5$ is equivalent to one of $1-p < 0.5$
(the receiver can simply invert every bit), though this case is not relevant in practice.
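The capacity formula can be evaluated directly with the entropy helper sketched earlier:
\begin{verbatim}
def bsc_capacity(p):
    """Capacity in bits per use of a binary symmetric channel."""
    if p == 0.0 or p == 1.0:
        return 1.0                       # deterministic channel
    return 1.0 - entropy([p, 1 - p])     # C = 1 - H_2(p)

print(bsc_capacity(0.0))    # 1.0  : noiseless channel
print(bsc_capacity(0.11))   # ~0.5 : about half a bit per transmitted bit
print(bsc_capacity(0.5))    # 0.0  : pure noise, nothing gets through
\end{verbatim}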
Shannon's theorem is not constructive, as it does not provide an explicit method for constructing such efficient codes,
but it guarantees their existence.
In practice, structured codes such as Hamming and Reed-Solomon codes are employed to approach channel capacity.
\section{Conclusion}
Entropy provides a fundamental measure of uncertainty and information,
bridging concepts from thermodynamics to modern communication theory.
Beyond the provided examples, the concept of entropy has far-reaching applications in diverse fields:
from cryptography, where it quantifies randomness and security,
to statistical physics, where it characterizes disorder in complex systems,
to biology, connecting molecular information and population diversity.
\printbibliography
\end{document}