Compare commits
1 commit: f8013bcee5...33ce8dc03d
.gitignore (vendored): 1 changed line

@@ -332,4 +332,3 @@ TSWLatexianTemp*
 
 # End of https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode
 bib
-*.*-SAVE-ERROR
entropy.bib: 35 deleted lines

@@ -1,35 +0,0 @@
-@misc{ enwiki:shannon-hartley,
-    author = "{Wikipedia contributors}",
-    title = "Shannon–Hartley theorem --- {Wikipedia}{,} The Free Encyclopedia",
-    year = "2025",
-    url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633",
-    note = "[Online; accessed 29-October-2025]"
-}
-@misc{ enwiki:noisy-channel,
-    author = "{Wikipedia contributors}",
-    title = "Noisy-channel coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
-    year = "2025",
-    url = "https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870",
-    note = "[Online; accessed 29-October-2025]"
-}
-@misc{ enwiki:source-coding,
-    author = "{Wikipedia contributors}",
-    title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
-    year = "2025",
-    url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
-    note = "[Online; accessed 29-October-2025]"
-}
-@misc{ dewiki:nyquist-shannon,
-    author = "Wikipedia",
-    title = "Nyquist-Shannon-Abtasttheorem --- Wikipedia{,} die freie Enzyklopädie",
-    year = "2025",
-    url = "https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066",
-    note = "[Online; Stand 29. Oktober 2025]"
-}
-@misc{ enwiki:information-content,
-    author = "{Wikipedia contributors}",
-    title = "Information content --- {Wikipedia}{,} The Free Encyclopedia",
-    year = "2025",
-    url = "https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600",
-    note = "[Online; accessed 29-October-2025]"
-}
entropy.tex: 316 changed lines

@@ -2,20 +2,14 @@
 \usepackage[utf8x]{inputenc}
 \usepackage[margin=1in]{geometry} % Adjust margins
 \usepackage{caption}
-\usepackage{wrapfig}
 \usepackage{subcaption}
 \usepackage{parskip} % don't indent after paragraphs, figures
 \usepackage{xcolor}
-%\usepackage{csquotes} % Recommended for biblatex
 \usepackage{tikz}
-\usepackage{pgfplots}
-\usetikzlibrary{positioning}
 \usepackage{float}
 \usepackage{amsmath}
 \PassOptionsToPackage{hyphens}{url}
 \usepackage{hyperref} % allows urls to follow line breaks of text
-\usepackage[style=ieee, backend=biber, maxnames=1, minnames=1]{biblatex}
-\addbibresource{entropy.bib}
 
 
 
@@ -32,9 +26,8 @@ Originating in classical thermodynamics,
 over time it has been applied in different sciences such as chemistry and information theory.
 %As the informal concept of entropy gains popularity, its specific meaning can feel far-fetched and ambiguous.
 The name 'entropy' was first coined by German physicist \textit{Rudolf Clausius} in 1865
-while postulating the second law of thermodynamics, one of 3(4)
-laws of thermodynamics based on universal observation regarding heat and energy conversion.
-
+while postulating the second law of thermodynamics.
+The laws of thermodynamics are based on universal observation regarding heat and energy conversion.
 Specifically, the second law states that not all thermal energy can be converted into work in a cyclic process.
 Or, in other words, that the entropy of an isolated system cannot decrease,
 as they always tend toward a state of thermodynamic equilibrium where entropy is highest for a given internal energy.
@@ -49,312 +42,15 @@ that is still in use in information theory today.
 \begin{equation}
     S = -k_B \sum_i p_i \ln(p_i)
 \end{equation}
-It gives statistical meaning to the macroscopic phenomenon of classical thermodynamics
-by defining the entropy $S$ of a macrostate as
+It gives statistical meaning to the macroscopic phenomenon by defining the entropy $S$ of a macrostate as
 the result of probabilities $p_i$ of all its constituting micro states.
-$k_B$ refers to the Boltzmann constant, which he himself did not determine but is part of today's SI system.
-
-\section{Shannon's axioms}
-\textit{Claude Shannon} adapted the concept of entropy to information theory.
-In an era of advancing communication technologies, the question he addressed was of increasing importance:
-How can messages be encoded and transmitted efficiently?
-He proposed three (arguably four) axioms that a measure of information would have to comply with:
-\begin{enumerate}
-    \item $I(1) = 0$: events that always occur do not communicate information.
-    \item $I'(p) \leq 0$: $I(p)$ is monotonically decreasing in $p$; an increase in the probability of an event
-          decreases the information from an observed event, and vice versa.
-    \item $I(p_1 \cdot p_2) = I(p_1) + I(p_2)$: the information learned from independent events
-          is the sum of the information learned from each event.
-    \item $I(p)$ is a twice continuously differentiable function of $p$.
-\end{enumerate}
-
-As a measure, Shannon's formula uses the \textit{Bit}, quantifying the efficiency of codes
-and media for transmission and storage.
-In information theory, entropy can be understood as the expected information of a message.
-\begin{equation}
-    H = E(I) = - \sum_i p_i \log_2(p_i)
-    \label{eq:entropy-information}
-\end{equation}
-This leaves $I = \log_2(1/p_i) = -\log_2(p_i)$, implying that an unexpected message (low probability) carries
-more information than one with higher probability.
-Intuitively, we can imagine David A. Johnston, a volcanologist reporting day after day that there is no
-activity on Mount St. Helens. After a while, we grow to expect this message because it is statistically very likely
-that tomorrow's message will be the same. When some day we get the message 'Vancouver! This is it!' it carries a lot of information
-not only semantically (because it announces the eruption of a volcano) but statistically because it was very unlikely
-given the transmission history.
-
-However, uncertainty (entropy) in this situation would be relatively low.
-Because we attach high surprise only to the unlikely message of an eruption, the significantly more likely message
-carries less information - we already expected it before it arrived.
-
-Putting the axioms and our intuitive understanding of information and uncertainty together,
-we can see the logarithmic decay of information transported by a message as its probability increases in \autoref{fig:graph-information},
-as well as the entropy for a 2-event source given by solving \autoref{eq:entropy-information} for $i=2$, resulting in
-$-p * \log_2(p) - (1-p) * \log_2(1-p)$.
-
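The removed passage above defines the surprisal $I(p) = -\log_2(p)$ and the two-event entropy $-p\log_2(p) - (1-p)\log_2(1-p)$. A minimal Python sketch of both quantities (function names and example probabilities are our own, not part of the repository):

    import math

    def surprisal(p: float) -> float:
        """Information content of an event with probability p, in bits."""
        return -math.log2(p)

    def entropy(probs) -> float:
        """Shannon entropy H = -sum p_i log2 p_i, skipping zero probabilities."""
        return -sum(p * math.log2(p) for p in probs if p > 0)

    # A rare message carries more bits than a likely one:
    print(surprisal(0.01))        # ~6.64 bits
    print(surprisal(0.99))        # ~0.014 bits
    # Two-event source: entropy peaks at p = 0.5 (1 bit) and vanishes at p = 0 or 1.
    print(entropy([0.5, 0.5]))    # 1.0
    print(entropy([0.99, 0.01]))  # ~0.08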
-\begin{figure}[H]
-\begin{minipage}{.5\textwidth}
-    \begin{tikzpicture}
-    \begin{axis}[
-        domain=0:1,
-        samples=100,
-        axis lines=middle,
-        xlabel={$p$},
-        ylabel={Information [bits]},
-        xmin=0, xmax=1,
-        ymin=0, ymax=6.1,
-        grid=both,
-        width=8cm,
-        height=6cm,
-        every axis x label/.style={at={(current axis.right of origin)}, anchor=west},
-        every axis y label/.style={at={(current axis.above origin)}, anchor=south},
-    ]
-        \addplot[thick, blue] {-log2(x)};
-    \end{axis}
-    \end{tikzpicture}
-    \caption{Information contained in a message depending on its probability $p$}
-    \label{fig:graph-information}
-\end{minipage}
-\begin{minipage}{.5\textwidth}
-    \begin{tikzpicture}
-    \begin{axis}[
-        domain=0:1,
-        samples=100,
-        axis lines=middle,
-        xlabel={$p$},
-        ylabel={Entropy [bits]},
-        xmin=0, xmax=1,
-        ymin=0, ymax=1.1,
-        xtick={0,0.25,0.5,0.75,1},
-        grid=both,
-        width=8cm,
-        height=6cm,
-        every axis x label/.style={at={(current axis.right of origin)}, anchor=west},
-        every axis y label/.style={at={(current axis.above origin)}, anchor=south},
-    ]
-        \addplot[thick, blue] {-x * log2(x) - (1-x) * log2(1-x)};
-    \end{axis}
-    \end{tikzpicture}
-    \caption{Entropy of an event source with two possible events, depending on their probabilities $(p, 1-p)$}
-    \label{fig:graph-entropy}
-\end{minipage}
-\end{figure}
-
-The base 2 is chosen for the logarithm as our computers rely on a system of the same base, but theoretically
-arbitrary bases can be used as they are proportional according to $\log_a b = \frac{\log_c b}{\log_c a}$.
-
-Further, the $\log_2$ can be intuitively understood for an event source with $2^n$ possible outcomes -
-using standard binary coding, we can easily see that a message has to contain $\log_2(2^n) = n$ bits
-in order to be able to encode all possible outcomes.
-For numbers where $a \neq 2^n$, such as $a=10$, it is easy to see that there exists a number $a^k \approx 2^n$
-which defines a message size that can encode the outcomes of $k$ event sources with $a$ outcomes each,
-leaving the required bits per event source at $\log_2(a^k) \div k = \log_2(a)$.
-
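The block-coding argument above can be checked numerically. A small sketch (our own example, assuming a 10-outcome source) showing the per-symbol cost approaching $\log_2(10) \approx 3.32$ bits as blocks grow:

    import math

    a = 10  # outcomes per event source (arbitrary example value)
    for k in (1, 3, 10, 1000):
        n = math.ceil(k * math.log2(a))  # bits needed to encode a block of k symbols
        print(k, n, n / k)               # per-symbol cost: 4.0, 3.33, 3.4, 3.322 -> log2(10)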
+\section{Definitions across disciplines}
 %- conditional entropy
 %- redundancy
 %- source entropy
-\section{Applications}
-\subsection{Decision Trees}
+\section{Shannon Axioms}
+\section{Coding of a source of an information and communication channel}
-A decision tree is a supervised learning approach commonly used in machine learning.
-The goal is to create an algorithm, i.e.\ a series of questions to pose to new data (input variables)
-in order to predict the target variable, a class label.
-Graphically, each question can be visualized as a node in a tree, splitting the dataset into two or more groups.
-This process is applied to the source set and then to its resulting sets in a process called \textit{recursive partitioning}.
-Once a leaf is reached, the class of the input has been successfully determined.
-
-In order to build the shallowest possible trees, we want to use input variables that minimize uncertainty.
-While other measures for the best choice exist, such as the \textit{Gini impurity},
-entropy is a popular measure used in decision trees.
-
-Using what we learned about entropy, we want the maximum decrease in entropy of our target variable,
-as explained in~\autoref{ex:decisiontree}.
-\begin{figure}[H]
-    \centering
-    \begin{minipage}{.3\textwidth}
-        \begin{tabular}{c|c|c}
-                    & hot & cold \\
-            \hline
-            rain    & 4   & 5   \\
-            \hline
-            no rain & 3   & 2   \\
-        \end{tabular}
-    \end{minipage}
-    \begin{minipage}{.6\textwidth}
-        When choosing rain as a target variable, the entropy prior to partitioning is $H_{prior} = H(\frac{9}{14},\frac{5}{14})$,
-        after partitioning by temperature (hot/cold), $H_{hot} = H(\frac{4}{7}, \frac{3}{7})$
-        and $H_{cold} = H(\frac{5}{7}, \frac{2}{7})$ remain.
-        This leaves us with an expected entropy of
-        $p_{hot} * H_{hot} + p_{cold} * H_{cold}$.
-        The \textbf{information gain} can then be calculated as the difference of the entropy prior to and post partitioning.
-        Since $H_{prior}$ is constant in this equation, it is sufficient to minimize the post-partitioning $E[H]$.
-    \end{minipage}
-    \caption{Example of information gain in decision trees}
-    \label{ex:decisiontree}
-\end{figure}
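The worked example in the removed figure can be reproduced numerically. An illustrative sketch (variable names are our own) of the information gain for the rain/temperature table:

    import math

    def entropy(probs):
        """H = -sum p log2 p (in bits), skipping zero probabilities."""
        return -sum(p * math.log2(p) for p in probs if p > 0)

    # Counts from the example table: (hot, cold) per class of the target variable.
    rain, no_rain = (4, 5), (3, 2)
    n_hot, n_cold = rain[0] + no_rain[0], rain[1] + no_rain[1]   # 7, 7
    n = n_hot + n_cold                                           # 14

    h_prior = entropy([sum(rain) / n, sum(no_rain) / n])         # H(9/14, 5/14)
    h_hot   = entropy([rain[0] / n_hot, no_rain[0] / n_hot])     # H(4/7, 3/7)
    h_cold  = entropy([rain[1] / n_cold, no_rain[1] / n_cold])   # H(5/7, 2/7)
    expected = (n_hot / n) * h_hot + (n_cold / n) * h_cold       # E[H] after the split
    gain = h_prior - expected                                    # information gain

    print(f"H_prior={h_prior:.3f}  E[H]={expected:.3f}  gain={gain:.3f}")
    # H_prior=0.940  E[H]=0.924  gain=0.016: this particular split helps only slightly.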
-
-Advantages of decision trees over other machine learning approaches include low computation cost and
-interpretability, making them a popular choice for many applications.
-However, drawbacks include overfitting and poor robustness, where minimal alterations to training data
-can lead to a change in tree structure.
-
-\subsection{Cross-Entropy}
-When dealing with two distributions, the \textit{cross-entropy}
-between a true distribution $p$
-and an estimated distribution $q$ is defined as:
-\begin{equation}
-    H(p, q) = -\sum_x p(x) \log_2 q(x)
-\end{equation}
-The \textit{Kullback--Leibler divergence} measures how much information is lost when $q$
-is used to approximate $p$:
-\begin{equation}
-    D_{KL}(p \| q) = H(p, q) - H(p)
-\end{equation}
-In machine learning, this term appears in many loss/cost functions --- notably in classification problems
-(cross-entropy loss) and in probabilistic models such as Variational Autoencoders (VAEs).
-There, the true and predicted label are used as the true and estimated distribution, respectively.
-In a supervised training example, the cross-entropy loss degenerates to $-\log(p_{pred,i})$ as the
-true label vector is assumed to be the unit vector $e_i$ (one-hot).
-
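The removed definitions translate directly into code. A small illustrative sketch (our own function names and example distributions) of cross-entropy, KL divergence, and the one-hot special case mentioned above:

    import math

    def entropy(p):
        return -sum(pi * math.log2(pi) for pi in p if pi > 0)

    def cross_entropy(p, q):
        """H(p, q) = -sum_x p(x) log2 q(x); p is the true, q the estimated distribution."""
        return -sum(pi * math.log2(qi) for pi, qi in zip(p, q) if pi > 0)

    def kl_divergence(p, q):
        """D_KL(p || q) = H(p, q) - H(p); non-negative, zero iff p == q."""
        return cross_entropy(p, q) - entropy(p)

    p = [0.7, 0.2, 0.1]   # true distribution (arbitrary example)
    q = [0.6, 0.3, 0.1]   # model estimate
    print(cross_entropy(p, q), kl_divergence(p, q))

    # One-hot case (classification): the loss reduces to -log2 q[i] for the true class i.
    one_hot = [0.0, 1.0, 0.0]
    print(cross_entropy(one_hot, q), -math.log2(q[1]))   # identical values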
-\subsection{Coding}
-The concept of entropy also plays a crucial role in the design and evaluation of codes used for data compression and transmission.
-In this context, \textit{coding} refers to the representation of symbols or messages
-from a source using a finite set of codewords.
-Each codeword is typically composed of a sequence of bits,
-and the design goal is to minimize the average length of these codewords while maintaining unique decodability.
-
-According to Shannon's source coding theorem, the theoretical lower bound for the average codeword length of a source
-is given by its entropy $H$.
-In other words, no lossless coding scheme can achieve an average length smaller than the source entropy when expressed in bits.
-Codes that approach this bound are called \textit{efficient} or \textit{entropy-optimal}.
-A familiar example of such a scheme is \textit{Huffman coding},
-which assigns shorter codewords to more probable symbols and longer ones to less probable symbols,
-resulting in a prefix-free code with minimal expected length.
-
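Huffman's construction mentioned above fits in a few lines. A minimal sketch (illustrative only, not code from the repository) that builds a prefix-free code and compares its expected length with the entropy bound:

    import heapq, itertools, math

    def huffman(freqs):
        """Return a prefix-free code {symbol: bitstring} for a {symbol: probability} dict."""
        counter = itertools.count()   # tie-breaker so heapq never has to compare dicts
        heap = [(p, next(counter), {s: ""}) for s, p in freqs.items()]
        heapq.heapify(heap)
        while len(heap) > 1:
            p1, _, c1 = heapq.heappop(heap)
            p2, _, c2 = heapq.heappop(heap)
            merged = {s: "0" + code for s, code in c1.items()}
            merged.update({s: "1" + code for s, code in c2.items()})
            heapq.heappush(heap, (p1 + p2, next(counter), merged))
        return heap[0][2]

    freqs = {"a": 0.5, "b": 0.25, "c": 0.125, "d": 0.125}   # arbitrary dyadic example
    code = huffman(freqs)
    avg_len = sum(p * len(code[s]) for s, p in freqs.items())
    entropy = -sum(p * math.log2(p) for p in freqs.values())
    print(code)              # e.g. {'a': '0', 'b': '10', 'c': '110', 'd': '111'}
    print(avg_len, entropy)  # 1.75 1.75: dyadic probabilities meet the entropy bound exactly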
-Beyond compression, coding is essential for reliable communication over imperfect channels.
-In real-world systems, transmitted bits are often corrupted by noise, requiring mechanisms to detect and correct errors.
-One simple but powerful concept to quantify the robustness of a code is the \textit{Hamming distance}.
-The Hamming distance between two codewords is defined as the number of bit positions in which they differ.
-For example, the codewords $10110$ and $11100$ have a Hamming distance of 2.
-
-A code with a minimum Hamming distance $d_{min}$ can detect up to $d_{min}-1$ errors
-and correct up to $\lfloor (d_{min}-1)/2 \rfloor$ errors.
-This insight forms the basis of error-correcting codes such as Hamming codes,
-which add redundant bits to data in a structured way that enables the receiver to both identify and correct single-bit errors.
-
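The Hamming distance and the detection/correction bounds quoted above, as a tiny illustrative sketch (our own code):

    def hamming_distance(a: str, b: str) -> int:
        """Number of positions in which two equal-length codewords differ."""
        assert len(a) == len(b)
        return sum(x != y for x, y in zip(a, b))

    print(hamming_distance("10110", "11100"))   # 2, as in the example above

    # For a code with minimum distance d_min:
    d_min = 3                                            # e.g. the (7,4) Hamming code
    print("detects up to", d_min - 1, "errors")          # 2
    print("corrects up to", (d_min - 1) // 2, "errors")  # 1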
-Thus, the efficiency and reliability of communication systems are governed by a trade-off:
-higher redundancy (lower efficiency) provides greater error correction capability,
-while minimal redundancy maximizes data throughput but reduces error resilience.
-
-%Coding of a source of an information and communication channel
-% https://www.youtube.com/watch?v=ErfnhcEV1O8
-% relation to hamming distance and efficient codes
-
-\subsection{Noisy communication channels}
-The noisy channel coding theorem was stated by \textit{Claude Shannon} in 1948, but the first rigorous proof was
-provided in 1954 by Amiel Feinstein.
-One of the important issues Shannon tackled with his 'Mathematical theory of communication'
-was the lack of means of transporting discrete data through a noisy channel more efficiently than
-the telegraph - or, how to communicate reliably over an unreliable channel.
-The means of error correction until then had been limited to very basic methods.
-
-\begin{figure}[H]
-    \begin{tikzpicture}
-        \def\boxw{2.5cm}
-        \def\n{5}
-        \pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
-        % Draw the boxes
-        \node (A) at (0, 0) [draw, text width=\boxw, align=center] {Information Source};
-        \node (B) at (\boxw + \gap, 0) [draw, text width=\boxw, align=center] {Transmitter};
-        \node (C) at ({2*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Channel};
-        \node (N) at ({2*(\boxw + \gap)}, -1) [draw, text width=\boxw, align=center] {Noise};
-        \node (D) at ({3*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Receiver};
-        \node (E) at ({4*(\boxw + \gap)}, 0) [draw, text width=\boxw, align=center] {Destination};
-
-        % Draw arrows between the boxes
-        \draw[->] (A) -- (B);
-        \draw[->] (B) -- (C);
-        \draw[->] (C) -- (D);
-        \draw[->] (D) -- (E);
-        \draw[->] (N) -- (C);
-    \end{tikzpicture}
-    \caption{Model of a noisy communication channel}
-    \label{fig:noisy-channel}
-\end{figure}
-First, analogue connections like the early telephone lines bypassed the issue altogether and relied
-on the communicating parties and their brains' ability to filter human voices from the noise that was inevitably transmitted
-along with the intended signal.
-After some development, the telegraph in its final form used Morse code, a series of long and short clicks that,
-together with letter and word gaps, would encode text messages.
-Even though the long-short coding might appear similar to today's binary coding, the means of error correction were lacking.
-For a long time, it relied on simply repeating the message multiple times, which is highly inefficient.
-The destination would then have to determine the most likely intended message by performing a majority vote.
-One might also propose simply increasing transmitting power, thereby decreasing the error rate of the associated channel.
-However, the noisy channel coding theorem provides us with a more elegant solution.
-It is of foundational importance to information theory, stating that given a noisy channel with capacity $C$
-and information transmitted at any rate $R < C$, there exist codes with which the error rate at the receiver can be made
-arbitrarily small.
-
-
-
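The repetition-plus-majority-vote scheme described above can be quantified. An illustrative sketch (our own, assuming a binary symmetric channel with flip probability $p = 0.1$) of how threefold repetition trades rate for reliability:

    import random

    def send_with_repetition(bit: int, p: float, n: int = 3) -> int:
        """Transmit one bit n times over a BSC(p) and decode by majority vote."""
        received = [bit ^ (random.random() < p) for _ in range(n)]
        return int(sum(received) > n // 2)

    p = 0.1
    trials = 100_000
    errors = sum(send_with_repetition(0, p) != 0 for _ in range(trials))
    print(errors / trials)   # ~0.028 = 3p^2(1-p) + p^3, versus p = 0.1 uncoded
    # The price: the rate drops to 1/3 bit per transmitted bit.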
-\paragraph{Channel capacity and mutual information}
-For any discrete memoryless channel, we can describe its behavior with a conditional probability distribution
-$p(y|x)$ --- the probability that symbol $y$ is received given symbol $x$ was sent.
-The \textit{mutual information} between the transmitted and received signals measures how much information, on average, passes through the channel:
-\begin{equation}
-    I(X;Y) = \sum_{x,y} p(x, y) \log_2 \frac{p(x, y)}{p(x)p(y)} = H(Y) - H(Y|X)
-\end{equation}
-The \textit{channel capacity} $C$ is then defined as the maximum achievable mutual information across all possible input distributions:
-\begin{equation}
-    C = \max_{p(x)} I(X;Y)
-\end{equation}
-It represents the highest rate (in bits per symbol) at which information can be transmitted with arbitrarily small error,
-given optimal encoding and decoding schemes.
-
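The mutual information formula above can be evaluated directly for a discrete memoryless channel, given an input distribution $p(x)$ and the transition probabilities $p(y|x)$. An illustrative sketch (our own code and example channel):

    import math

    def mutual_information(p_x, p_y_given_x):
        """I(X;Y) = sum_{x,y} p(x,y) log2( p(x,y) / (p(x) p(y)) ), in bits."""
        p_y = [sum(p_x[x] * p_y_given_x[x][y] for x in range(len(p_x)))
               for y in range(len(p_y_given_x[0]))]
        info = 0.0
        for x in range(len(p_x)):
            for y in range(len(p_y)):
                p_xy = p_x[x] * p_y_given_x[x][y]
                if p_xy > 0:
                    info += p_xy * math.log2(p_xy / (p_x[x] * p_y[y]))
        return info

    # Binary symmetric channel with crossover probability 0.1 and uniform input:
    p = 0.1
    bsc = [[1 - p, p], [p, 1 - p]]
    print(mutual_information([0.5, 0.5], bsc))   # ~0.531 = 1 - H2(0.1), the BSC capacity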
-\paragraph{Binary symmetric channel (BSC)}
-The binary symmetric channel is one example of such discrete memoryless channels, where each transmitted bit
-has a probability $p$ of being flipped during transmission and a probability $(1-p)$ of being received correctly.
-
-\begin{figure}[H]
-    \begin{tikzpicture}
-        \def\boxw{2.5cm}
-        \def\n{4}
-        \pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
-        \node (S) at (0,0) [draw, align=center, text width=\boxw] {Transmitter};
-        \node (S0) at (\boxw + \gap,1) [draw, circle] {0};
-        \node (S1) at (\boxw + \gap,-1) [draw, circle] {1};
-        \node (D0) at ({2*(\boxw + \gap)},1) [draw, circle] {0};
-        \node (D1) at ({2*(\boxw + \gap)},-1) [draw, circle] {1};
-        \node (D) at ({3*(\boxw + \gap)},0) [draw, align=center, text width=\boxw] {Receiver};
-
-        \draw[->] (S) -- (S0);
-        \draw[->] (S) -- (S1);
-
-        \draw[->,dashed] (S0) -- (D0) node[midway, above] {$1-p$};
-        \draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$p$};
-        \draw[->,dashed] (S1) -- (D0) node[pos=0.2, above] {$p$};
-        \draw[->,dashed] (S1) -- (D1) node[midway, below] {$1-p$};
-
-        \draw[->] (D0) -- (D);
-        \draw[->] (D1) -- (D);
-    \end{tikzpicture}
-    \caption{Binary symmetric channel with crossover probability $p$}
-    \label{fig:binary-channel}
-\end{figure}
-
-The capacity of the binary symmetric channel is given by:
-\begin{equation}
-    C = 1 - H_2(p)
-\end{equation}
-where $H_2(p) = -p \log_2(p) - (1-p)\log_2(1-p)$ is the binary entropy function.
-As $p$ increases, uncertainty grows and channel capacity declines.
-When $p = 0.5$, output bits are completely random and no information can be transmitted ($C = 0$).
-As already shown in \autoref{fig:graph-entropy}, an error rate of $p > 0.5$ is equivalent to one of $1-p < 0.5$,
-though this is not relevant in practice.
-
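The closed form $C = 1 - H_2(p)$ quoted above, evaluated for a few crossover probabilities (illustrative only, our own values):

    import math

    def binary_entropy(p: float) -> float:
        if p in (0.0, 1.0):
            return 0.0
        return -p * math.log2(p) - (1 - p) * math.log2(1 - p)

    for p in (0.0, 0.01, 0.1, 0.25, 0.5, 0.9):
        print(p, 1 - binary_entropy(p))
    # 0.0 -> 1.0, 0.01 -> ~0.919, 0.1 -> ~0.531, 0.25 -> ~0.189, 0.5 -> 0.0,
    # 0.9 -> ~0.531 again: a channel that flips most bits carries as much information as one that flips few.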
-Shannon's theorem is not constructive as it does not provide an explicit method for constructing such efficient codes,
-but it guarantees their existence.
-In practice, structured codes such as Hamming and Reed--Solomon codes are employed to approach channel capacity.
-
-\printbibliography
 
 \end{document}