bib
.gitignore (vendored): 1 changed line
@@ -332,3 +332,4 @@ TSWLatexianTemp*
 
 # End of https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode
 bib
+*.*-SAVE-ERROR
entropy.bib: 10 changed lines
@@ -2,34 +2,34 @@
 author = "{Wikipedia contributors}",
 title = "Shannon–Hartley theorem --- {Wikipedia}{,} The Free Encyclopedia",
 year = "2025",
-howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633}",
+url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633",
 note = "[Online; accessed 29-October-2025]"
 }
 @misc{ enwiki:noisy-channel,
 author = "{Wikipedia contributors}",
 title = "Noisy-channel coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
 year = "2025",
-howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870}",
+url = "https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870",
 note = "[Online; accessed 29-October-2025]"
 }
 @misc{ enwiki:source-coding,
 author = "{Wikipedia contributors}",
 title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
 year = "2025",
-howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440}",
+url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
 note = "[Online; accessed 29-October-2025]"
 }
 @misc{ dewiki:nyquist-shannon,
 author = "Wikipedia",
 title = "Nyquist-Shannon-Abtasttheorem --- Wikipedia{,} die freie Enzyklopädie",
 year = "2025",
-url = "\url{https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066}",
+url = "https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066",
 note = "[Online; Stand 29. Oktober 2025]"
 }
 @misc{ enwiki:information-content,
 author = "{Wikipedia contributors}",
 title = "Information content --- {Wikipedia}{,} The Free Encyclopedia",
 year = "2025",
-howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600}",
+url = "https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600",
 note = "[Online; accessed 29-October-2025]"
 }
entropy.tex: 77 changed lines
@@ -5,6 +5,7 @@
 \usepackage{subcaption}
 \usepackage{parskip} % dont indent after paragraphs, figures
 \usepackage{xcolor}
+%\usepackage{csquotes} % Recommended for biblatex
 \usepackage{tikz}
 \usepackage{float}
 \usepackage{amsmath}
@@ -58,9 +59,9 @@ As a measure, Shannon's formula uses the \textit{Bit}, quantifying the efficienc
 and media for transmission and storage.
 According to his axioms, a measure for information has to comply with the following criteria:
 \begin{enumerate}
+\item $I(1) = 0$: events that always occur do not communicate information.
 \item $I(p)$ is monotonically decreasing in p: an increase in the probability of an event
 decreases the information from an observed event, and vice versa.
-\item $I(1) = 0$: events that always occur do not communicate information.
 \item $I(p_1 \cdot p_2) = I(p_1) + I(p_2)$: the information learned from independent events
 is the sum of the information learned from each event.
 \item $I(p)$ is a twice continuously differentiable function of p.
@@ -69,7 +70,7 @@ In information theory, entropy can be understood as the expected information of
 \begin{equation}
 H = E(I) = - \sum_i p_i \log_2(p_i)
 \end{equation}
-This leaves $ I =log(1/p) = - log_2(p_i)$, implying that an unexpected message (low probability) carries
+This leaves $I = \log_2(1/p_i) = -\log_2(p_i)$, implying that an unexpected message (low probability) carries
 more information than one with higher probability.
 Intuitively, we can imagine David A. Johnston, a volcanologist reporting day after day that there is no
 activity on Mount St. Helens. After a while, we grow to expect this message because it is statistically very likely
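
To make the definitions in this hunk concrete, here is a minimal Python sketch (an illustration, not part of the committed sources) that computes the information content $I(p) = -\log_2(p)$ and the entropy $H = -\sum_i p_i \log_2(p_i)$, and checks two of the axioms listed above; the probabilities are made-up example values.

import math

def information(p: float) -> float:
    """Information content in bits of an event with probability p."""
    return -math.log2(p)

def entropy(probs) -> float:
    """Shannon entropy in bits: the expected information of a distribution."""
    return -sum(p * math.log2(p) for p in probs if p > 0)

# Axiom checks: I(1) = 0, and I(p1 * p2) = I(p1) + I(p2) for independent events.
assert information(1.0) == 0.0
p1, p2 = 0.5, 0.25
assert math.isclose(information(p1 * p2), information(p1) + information(p2))

# A rare report ("eruption", p = 0.01) carries far more information than the
# expected daily "no activity" report (p = 0.99).
print(information(0.01))      # ~6.64 bits
print(information(0.99))      # ~0.014 bits
print(entropy([0.01, 0.99]))  # ~0.081 bits of expected information
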
@@ -92,7 +93,7 @@ leaving the required Bits per event source at $\log_2(a^k) \div k = \log_2(a)$.
 %- Quellentropie
 \section{Applications}
 \subsection{Decision Trees}
-A decision tree is a supervised learning approach commonly used in data mining.
+A decision tree is a supervised learning approach commonly used in machine learning.
 The goal is to create an algorithm, i.e a series of questions to pose to new data (input variables)
 in order to predict the target variable, a class label.
 Graphically, each question can be visualized as a node in a tree, splitting the dataset into two or more groups.
@@ -100,15 +101,11 @@ This process is applied to the source set and then its resulting sets in a proce
 Once a leaf is reached, the class of the input has been successfully determined.
 
 In order to build the shallowest possible trees, we want to use input variables that minimize uncertainty.
-While other measures for the best choice such as the Gini coefficient exist,
+While other measures for the best choice such as the \textit{Gini coefficient} exist,
 entropy is a popular measure used in decision trees.
 
-Using the previous learnings about information and entropy, we of course want to ask questions that have the
-highest information content, so pertaining input variables with the highest entropy.
-
-Zu berechnen ist also die Entropie des Klassifikators abzüglich der erwarteten Entropie
-nach der Aufteilung anhand des Attributs,
-erklärt in~\autoref{ex:decisiontree}.
+Using what we learned about entropy, we want the split that yields the maximum decrease in the entropy of our target variable,
+as explained in~\autoref{ex:decisiontree}.
 \begin{figure}[H]
 \centering
 \begin{minipage}{.3\textwidth}
@@ -121,25 +118,23 @@ erklärt in~\autoref{ex:decisiontree}.
 \end{tabular}
 \end{minipage}
 \begin{minipage}{.6\textwidth}
-Die Entropie für Regen ist also $H_{prior} = H(\frac{9}{14},\frac{5}{14})$,
-nach der Aufteilung anhand des Attributs Temperatur beträgt sie $H_{warm}= H(\frac{4}{7}, \frac{3}{7})$
-und $H_{kalt}= H(\frac{5}{7}, \frac{2}{7})$.
-Also ist die erwartete (Erwartungswert) Entropie nach der Aufteilung anhand der Temperatur $p_{warm} * H_{warm} + p_{kalt} * H_{kalt} $ .
-Der \textbf{Informationsgewinn} berechnet sich dann als Entropie für Regen abzüglich des Erwartungswertes der Entropie.
-Da $H_{prior}$ in dieser Berechnung jedoch konstant ist, kann auch einfach $E[H]$ nach der Aufteilung minimiert werden.
+When choosing rain as the target variable, the entropy prior to partitioning is $H_{prior} = H(\frac{9}{14},\frac{5}{14})$;
+after partitioning by temperature (hot/cold), $H_{hot} = H(\frac{4}{7}, \frac{3}{7})$
+and $H_{cold} = H(\frac{5}{7}, \frac{2}{7})$ remain.
+This leaves us with an expected entropy of
+$p_{hot} * H_{hot} + p_{cold} * H_{cold}$.
+The \textbf{information gain} can then be calculated as the difference between the entropy prior to and after partitioning.
+Since $H_{prior}$ is constant in this calculation, it is sufficient to minimize the post-partitioning $E[H]$.
 \end{minipage}
-\caption{Beispiel Informationsgewinn für Aufbau eines Entscheidungsbaums}
+\caption{Example of information gain in decision trees}
 \label{ex:decisiontree}
 \end{figure}
 
-Vorteil: Niedriger Rechenaufwand und nachvollziehbarer Aufbau
-Nachteil: Overfitting, geringe Robustheit: bereits kleine Änderungen der Trainingsdaten können zu einer Veränderung des Baums führen.
-
-Kann zu einem \textbf{Random Forest} (bagging) erweitert werden um die Robustheit zu steigern und Overfitting zu verringern.
-Resultiert jedoch in höherem Rechenaufwand und geringerer Interpretierbarkeit.
-Die Trainingsdaten können dafür zufällig gruppiert werden und die Bäume für eine Mehrheitsentscheidung genutzt werden.
+Advantages of decision trees over other machine learning approaches include low computation cost and
+interpretability, making them a popular choice for many applications.
+However, drawbacks include overfitting and poor robustness, where minimal alterations to the training data
+can lead to a change in tree structure.
 \subsection{Cross-Entropy}
 Kullback-Leibler = $H(p,q) - H(p)$
 as a cost function in machine learning
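
The worked example in the new figure text can be checked with a short Python sketch (an illustration, not part of the committed sources); the counts 9/14, 5/14, 4/7, 3/7, 5/7 and 2/7 are the ones quoted in the hunk above.

import math

def entropy(probs) -> float:
    """Shannon entropy in bits of a discrete distribution."""
    return -sum(p * math.log2(p) for p in probs if p > 0)

# Target variable "rain": 9 rainy vs. 5 dry days out of 14.
h_prior = entropy([9/14, 5/14])                 # ~0.940 bits

# Branch entropies after splitting on temperature (hot/cold), 7 days each.
h_hot  = entropy([4/7, 3/7])                    # ~0.985 bits
h_cold = entropy([5/7, 2/7])                    # ~0.863 bits

# Expected entropy after the split, weighted by the branch probabilities.
p_hot, p_cold = 7/14, 7/14
h_expected = p_hot * h_hot + p_cold * h_cold    # ~0.924 bits

information_gain = h_prior - h_expected         # ~0.016 bits
print(h_prior, h_expected, information_gain)

The Cross-Entropy subsection is still a stub, so purely as a hedged illustration (again not part of the commit), here is the relation named in its note, $D_{KL}(p \| q) = H(p,q) - H(p)$, with cross-entropy used as a cost function; the distributions are made-up example values.

import math

def cross_entropy(p, q) -> float:
    """H(p, q) = -sum_i p_i * log2(q_i), in bits."""
    return -sum(pi * math.log2(qi) for pi, qi in zip(p, q) if pi > 0)

def kl_divergence(p, q) -> float:
    """Kullback-Leibler divergence D(p || q) = H(p, q) - H(p)."""
    return cross_entropy(p, q) - cross_entropy(p, p)

# One-hot true label vs. a model's predicted class probabilities, as in a
# typical classification cost function.
p = [1.0, 0.0, 0.0]
q = [0.7, 0.2, 0.1]
print(cross_entropy(p, q))   # ~0.515 bits, the loss for this prediction
print(kl_divergence(p, q))   # identical here, since H(p) = 0 for a one-hot p
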
@@ -151,8 +146,24 @@ as a cost function in machine learning
 \subsection{Noisy communication channels}
 The noisy channel coding theorem was stated by \textit{Claude Shannon} in 1948, but first rigorous proof was
 provided in 1954 by Amiel Feinstein.
-It is of foundational to information theory, stating that given a noisy channel with capacity $C$
-and information transmitted at $R$ \cite{enwiki:shannon-hartley}
+One of the important issues Shannon wanted to tackle with his 'Mathematical Theory of Communication'
+was the lack of means for transporting discrete data through a noisy channel more efficiently than
+the telegraph.
+Error correction until then had been limited to very basic techniques.
+
+First, analogue connections such as the early telephone lines bypassed the issue altogether and relied
+on the communicating parties and their brains' ability to filter human voices from the noise that was inevitably transmitted
+along with the intended signal.
+After some development, the telegraph in its final form used Morse code, a series of long and short clicks that,
+together with letter and word gaps, would encode text messages.
+Even though the long-short coding might appear similar to today's binary coding, the means of error correction were lacking.
+For a long time, it relied on simply repeating the message multiple times, which is highly inefficient.
+The destination would then have to determine the most likely intended message by performing a majority vote.
+One might also propose simply increasing the transmitting power, thereby decreasing the error rate of the associated channel.
+However, the noisy channel coding theorem provides us with a more elegant solution.
+It is of foundational importance to information theory, stating that given a noisy channel with capacity $C$
+and information transmitted at a rate $R < C$, there exist codes for which the error rate at the receiver can be made
+arbitrarily small.
 
 \begin{figure}[H]
 \begin{tikzpicture}
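
The repetition-and-majority-vote scheme described in the added text can be simulated in a few lines of Python (an illustration, not part of the committed sources; the crossover probability p = 0.1 is an arbitrary example value). It shows why the scheme is inefficient: the error rate falls with the number of repetitions n, but the code rate falls to 1/n.

import random

def send_through_bsc(bit: int, p: float) -> int:
    """Binary symmetric channel: flip the bit with crossover probability p."""
    return bit ^ 1 if random.random() < p else bit

def repetition_decode(bit: int, n: int, p: float) -> int:
    """Send the bit n times and take a majority vote at the destination."""
    received = [send_through_bsc(bit, p) for _ in range(n)]
    return 1 if sum(received) > n / 2 else 0

def error_rate(n: int, p: float, trials: int = 100_000) -> float:
    errors = sum(repetition_decode(1, n, p) != 1 for _ in range(trials))
    return errors / trials

p = 0.1  # assumed crossover probability, for illustration
for n in (1, 3, 5, 7):
    # The error probability drops with n, but the code rate is only 1/n.
    print(f"n={n}: rate={1/n:.2f}, error~{error_rate(n, p):.4f}")
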
@@ -183,20 +194,20 @@ and information transmitted at $R$ \cite{enwiki:shannon-hartley}
 \def\boxw{2.5cm}
 \def\n{4}
 \pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
-\node (S) at (0,0) [draw, align=center, text width=\boxw] {Information Source};
+\node (S) at (0,0) [draw, align=center, text width=\boxw] {Transmitter};
 \node (S0) at (\boxw + \gap,1) [draw, circle] {0};
 \node (S1) at (\boxw + \gap,-1) [draw, circle] {1};
 \node (D0) at ({2*(\boxw + \gap)},1) [draw, circle] {0};
 \node (D1) at ({2*(\boxw + \gap)},-1) [draw, circle] {1};
-\node (D) at ({3*(\boxw + \gap)},0) [draw, align=center, text width=\boxw] {Destination};
+\node (D) at ({3*(\boxw + \gap)},0) [draw, align=center, text width=\boxw] {Receiver};
 
 \draw[->] (S) -- (S0);
 \draw[->] (S) -- (S1);
 
-\draw[->,dashed] (S0) -- (D0) node[midway, above] {$p$};
-\draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$1-p$};
-\draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$1-p$};
-\draw[->,dashed] (S1) -- (D1) node[midway, below] {$p$};
+\draw[->,dashed] (S0) -- (D0) node[midway, above] {$1-p$};
+\draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$p$};
+\draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$p$};
+\draw[->,dashed] (S1) -- (D1) node[midway, below] {$1-p$};
 
 \draw[->] (D0) -- (D);
 \draw[->] (D1) -- (D);
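
For reference alongside the corrected figure (an illustration, not part of the committed sources): the binary symmetric channel drawn above, with crossover probability p, has the standard capacity C = 1 - H_b(p), where H_b is the binary entropy function, so reliable transmission in the sense of the theorem requires R < C.

import math

def binary_entropy(p: float) -> float:
    """H_b(p) = -p*log2(p) - (1-p)*log2(1-p), in bits."""
    if p in (0.0, 1.0):
        return 0.0
    return -p * math.log2(p) - (1 - p) * math.log2(1 - p)

def bsc_capacity(p: float) -> float:
    """Capacity of a binary symmetric channel with crossover probability p."""
    return 1.0 - binary_entropy(p)

# A noiseless channel (p = 0) carries 1 bit per use; at p = 0.5 the output is
# independent of the input and the capacity drops to 0.
for p in (0.0, 0.1, 0.5):
    c = bsc_capacity(p)
    print(f"p={p}: C={c:.3f} bits/use (reliable transmission needs R < {c:.3f})")
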
@@ -205,4 +216,6 @@ and information transmitted at $R$ \cite{enwiki:shannon-hartley}
 \label{fig:binary-channel}
 \end{figure}
 
+\printbibliography
+
 \end{document}