From 3eb0f229fdfb303f781ea3fd1b42f9d55ecbb24b Mon Sep 17 00:00:00 2001
From: eneller
Date: Thu, 30 Oct 2025 13:20:34 +0100
Subject: [PATCH] bib

---
 .gitignore  |  1 +
 entropy.bib | 10 +++----
 entropy.tex | 77 +++++++++++++++++++++++++++++++----------------------
 3 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5ec2b2b..0f3f690 100644
--- a/.gitignore
+++ b/.gitignore
@@ -332,3 +332,4 @@ TSWLatexianTemp*
 
 # End of https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode
 bib
+*.*-SAVE-ERROR
diff --git a/entropy.bib b/entropy.bib
index e73b5b9..b0dd565 100644
--- a/entropy.bib
+++ b/entropy.bib
@@ -2,34 +2,34 @@
     author = "{Wikipedia contributors}",
     title = "Shannon–Hartley theorem --- {Wikipedia}{,} The Free Encyclopedia",
     year = "2025",
-    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633}",
+    url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633",
     note = "[Online; accessed 29-October-2025]"
 }
 @misc{ enwiki:noisy-channel,
     author = "{Wikipedia contributors}",
     title = "Noisy-channel coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
     year = "2025",
-    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870}",
+    url = "https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870",
     note = "[Online; accessed 29-October-2025]"
 }
 @misc{ enwiki:source-coding,
     author = "{Wikipedia contributors}",
     title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
     year = "2025",
-    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440}",
+    url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
     note = "[Online; accessed 29-October-2025]"
 }
 @misc{ dewiki:nyquist-shannon,
     author = "Wikipedia",
     title = "Nyquist-Shannon-Abtasttheorem --- Wikipedia{,} die freie Enzyklopädie",
     year = "2025",
-    url = "\url{https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066}",
+    url = "https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066",
     note = "[Online; Stand 29. Oktober 2025]"
 }
 @misc{ enwiki:information-content,
     author = "{Wikipedia contributors}",
     title = "Information content --- {Wikipedia}{,} The Free Encyclopedia",
     year = "2025",
-    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600}",
+    url = "https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600",
     note = "[Online; accessed 29-October-2025]"
 }
diff --git a/entropy.tex b/entropy.tex
index 5908164..e990fff 100644
--- a/entropy.tex
+++ b/entropy.tex
@@ -5,6 +5,7 @@
 \usepackage{subcaption}
 \usepackage{parskip} % dont indent after paragraphs, figures
 \usepackage{xcolor}
+%\usepackage{csquotes} % Recommended for biblatex
 \usepackage{tikz}
 \usepackage{float}
 \usepackage{amsmath}
@@ -58,9 +59,9 @@ As a measure, Shannon's formula uses the \textit{Bit}, quantifying the efficienc
 and media for transmission and storage.
 According to his axioms, a measure for information has to comply with the following criteria:
 \begin{enumerate}
+    \item $I(1) = 0$: events that always occur do not communicate information.
     \item $I(p)$ is monotonically decreasing in p: an increase in the probability of an event decreases the information from an observed event, and vice versa.
-    \item $I(1) = 0$: events that always occur do not communicate information.
     \item $I(p_1 \cdot p_2) = I(p_1) + I(p_2)$: the information learned from independent events is the sum of the information learned from each event.
     \item $I(p)$ is a twice continuously differentiable function of p.
@@ -69,7 +70,7 @@ In information theory, entropy can be understood as the expected information of
 \begin{equation}
     H = E(I) = - \sum_i p_i \log_2(p_i)
 \end{equation}
-This leaves $ I =log(1/p) = - log_2(p_i)$, implying that an unexpected message (low probability) carries
+This leaves $I = \log_2(1/p_i) = -\log_2(p_i)$, implying that an unexpected message (low probability) carries
 more information than one with higher probability.
 Intuitively, we can imagine David A. Johnston, a volcanologist reporting day after day that there is no activity on Mount St. Helens.
 After a while, we grow to expect this message because it is statistically very likely
@@ -92,7 +93,7 @@ leaving the required Bits per event source at $\log_2(a^k) \div k = \log_2(a)$.
 %- Quellentropie
 \section{Applications}
 \subsection{Decision Trees}
-A decision tree is a supervised learning approach commonly used in data mining.
+A decision tree is a supervised learning approach commonly used in machine learning.
 The goal is to create an algorithm, i.e a series of questions to pose to new data (input variables)
 in order to predict the target variable, a class label.
 Graphically, each question can be visualized as a node in a tree, splitting the dataset into two or more groups.
@@ -100,15 +101,11 @@ This process is applied to the source set and then its resulting sets in a proce
 Once a leaf is reached, the class of the input has been successfully determined.
 In order to build the shallowest possible trees, we want to use input variables that minimize uncertainty.
-While other measures for the best choice such as the Gini coefficient exist,
+While other measures for the best choice such as the \textit{Gini coefficient} exist,
 entropy is a popular measure used in decision trees.
-Using the previous learnings about information and entropy, we of course want to ask questions that have the
-highest information content, so pertaining input variables with the highest entropy.
-
-Zu berechnen ist also die Entropie des Klassifikators abzüglich der erwarteten Entropie
-nach der Aufteilung anhand des Attributs,
-erklärt in~\autoref{ex:decisiontree}.
+Using what we learned about entropy, we want the split that yields the maximum decrease in entropy of our target variable,
+as explained in~\autoref{ex:decisiontree}.
 \begin{figure}[H]
 \centering
 \begin{minipage}{.3\textwidth}
 \end{tabular}
 \end{minipage}
 \begin{minipage}{.6\textwidth}
-Die Entropie für Regen ist also $H_{prior} = H(\frac{9}{14},\frac{5}{14})$,
-nach der Aufteilung anhand des Attributs Temperatur beträgt sie $H_{warm}= H(\frac{4}{7}, \frac{3}{7})$
-und $H_{kalt}= H(\frac{5}{7}, \frac{2}{7})$.
-Also ist die erwartete (Erwartungswert) Entropie nach der Aufteilung anhand der Temperatur $p_{warm} * H_{warm} + p_{kalt} * H_{kalt} $ .
-Der \textbf{Informationsgewinn} berechnet sich dann als Entropie für Regen abzüglich des Erwartungswertes der Entropie.
-Da $H_{prior}$ in dieser Berechnung jedoch konstant ist, kann auch einfach $E[H]$ nach der Aufteilung minimiert werden.
+When choosing rain as the target variable, the entropy prior to partitioning is $H_{prior} = H(\frac{9}{14},\frac{5}{14})$;
+after partitioning by temperature (hot/cold), $H_{hot} = H(\frac{4}{7}, \frac{3}{7})$
+and $H_{cold} = H(\frac{5}{7}, \frac{2}{7})$ remain.
+This leaves us with an expected entropy after the split of
+$p_{hot} \cdot H_{hot} + p_{cold} \cdot H_{cold}$.
+The \textbf{information gain} can then be calculated as the difference between the entropy prior to and after partitioning.
+With seven hot and seven cold days this gives $H_{prior} \approx 0.94$ and an expected entropy of about $0.92$,
+i.e.\ an information gain of roughly $0.02$ bits.
+Since $H_{prior}$ is constant in this calculation, it is sufficient to minimize the post-partitioning expected entropy $E[H]$.
 \end{minipage}
-\caption{Beispiel Informationsgewinn für Aufbau eines Entscheidungsbaums}
+\caption{Example of information gain in decision trees}
 \label{ex:decisiontree}
 \end{figure}
+Advantages of decision trees over other machine learning approaches include low computation cost and
+interpretability, making them a popular choice for many applications.
+However, drawbacks include overfitting and poor robustness, where minimal alterations to the training data
+can lead to a change in the tree structure.
-Vorteil: Niedriger Rechenaufwand und nachvollziehbarer Aufbau
-Nachteil: Overfitting, geringe Robustheit: bereits kleine Änderungen der Trainingsdaten können zu einer Veränderung des Baums führen.
-
-
-Kann zu einem \textbf{Random Forest} (bagging) erweitert werden um die Robustheit zu steigern und Overfitting zu verringern.
-Resultiert jedoch in höherem Rechenaufwand und geringerer Interpretierbarkeit.
-Die Trainingsdaten können dafür zufällig gruppiert werden und die Bäume für eine Mehrheitsentscheidung genutzt werden.
 \subsection{Cross-Entropy}
 Kullback-Leibler = $H(p,q) - H(p)$
 as a cost function in machine learning
@@ -151,8 +146,24 @@ as a cost function in machine learning
 \subsection{Noisy communication channels}
 The noisy channel coding theorem was stated by \textit{Claude Shannon} in 1948,
 but first rigorous proof was provided in 1954 by Amiel Feinstein.
-It is of foundational to information theory, stating that given a noisy channel with capacity $C$
-and information transmitted at $R$ \cite{enwiki:shannon-hartley}
+One of the important issues Shannon wanted to tackle in \textit{A Mathematical Theory of Communication}
+was the lack of means for transmitting discrete data through a noisy channel that were more efficient than
+the telegraph.
+Until then, error correction had been limited to very basic techniques.
+
+Analogue connections such as the first telephone lines bypassed the issue altogether and relied
+on the communicating parties' ability to filter human voices from the noise that was inevitably transmitted
+along with the intended signal.
+After some development, the telegraph in its final form used Morse code, a series of long and short clicks that,
+together with letter and word gaps, encode text messages.
+Even though the long-short coding might appear similar to today's binary coding, the means of error correction were lacking.
+For a long time, error correction relied on simply repeating the message multiple times, which is highly inefficient.
+The destination would then have to determine the most likely intended message by performing a majority vote.
+One might also propose simply increasing the transmission power, thereby decreasing the error rate of the associated channel.
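+% Added worked example (editor's sketch): the three-fold repetition and the value $p = 0.1$ used below are
+% illustrative assumptions rather than figures from the original text.
+To put the cost of simple repetition into numbers: if every bit is sent three times over a channel that
+flips each bit independently with probability $p$, majority decoding fails only when at least two of the
+three copies are flipped,
+\begin{equation}
+    P_{\text{err}} = 3p^2(1-p) + p^3 \approx 0.028 \quad \text{for } p = 0.1,
+\end{equation}
+while the usable transmission rate drops to one third.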
+However, the noisy channel coding theorem provides us with a more elegant solution.
+It is of foundational importance to information theory, stating that given a noisy channel with capacity $C$
+and information transmitted at rate $R$, there exists an encoding scheme that allows the probability of error
+at the receiver to be made arbitrarily small, provided that $R < C$.
     \draw[->] (S) -- (S0);
     \draw[->] (S) -- (S1);
-    \draw[->,dashed] (S0) -- (D0) node[midway, above] {$p$};
-    \draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$1-p$};
-    \draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$1-p$};
-    \draw[->,dashed] (S1) -- (D1) node[midway, below] {$p$};
+    \draw[->,dashed] (S0) -- (D0) node[midway, above] {$1-p$};
+    \draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$p$};
+    \draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$p$};
+    \draw[->,dashed] (S1) -- (D1) node[midway, below] {$1-p$};
     \draw[->] (D0) -- (D);
     \draw[->] (D1) -- (D);
@@ -205,4 +216,6 @@
 \label{fig:binary-channel}
 \end{figure}
 
+\printbibliography
+
 \end{document}
\ No newline at end of file
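A possible complement to the binary-channel figure, offered here as an editor's sketch rather than as part of
the patch (the numeric value assumes a crossover probability of $p = 0.1$): the capacity of the binary symmetric
channel shown in \autoref{fig:binary-channel} can be written with the entropy defined earlier as

\begin{equation}
    C = 1 - H(p, 1-p) = 1 + p \log_2 p + (1-p) \log_2 (1-p),
\end{equation}

which evaluates to roughly $0.53$ bits per channel use for $p = 0.1$. The coding theorem then promises arbitrarily
reliable transmission at any rate below this value, whereas a simple repetition scheme is stuck at a rate equal to
the reciprocal of the number of repetitions.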