bib
.gitignore (vendored): 1 changed line
@@ -332,3 +332,4 @@ TSWLatexianTemp*
 
 # End of https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode
 bib
+*.*-SAVE-ERROR
entropy.bib: 10 changed lines
@@ -2,34 +2,34 @@
 author = "{Wikipedia contributors}",
 title = "Shannon–Hartley theorem --- {Wikipedia}{,} The Free Encyclopedia",
 year = "2025",
-howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633}",
+url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633",
 note = "[Online; accessed 29-October-2025]"
 }
 @misc{ enwiki:noisy-channel,
 author = "{Wikipedia contributors}",
 title = "Noisy-channel coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
 year = "2025",
-howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870}",
+url = "https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870",
 note = "[Online; accessed 29-October-2025]"
 }
 @misc{ enwiki:source-coding,
 author = "{Wikipedia contributors}",
 title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
 year = "2025",
-howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440}",
+url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
 note = "[Online; accessed 29-October-2025]"
 }
 @misc{ dewiki:nyquist-shannon,
 author = "Wikipedia",
 title = "Nyquist-Shannon-Abtasttheorem --- Wikipedia{,} die freie Enzyklopädie",
 year = "2025",
-url = "\url{https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066}",
+url = "https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066",
 note = "[Online; Stand 29. Oktober 2025]"
 }
 @misc{ enwiki:information-content,
 author = "{Wikipedia contributors}",
 title = "Information content --- {Wikipedia}{,} The Free Encyclopedia",
 year = "2025",
-howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600}",
+url = "https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600",
 note = "[Online; accessed 29-October-2025]"
 }
entropy.tex: 77 changed lines
@@ -5,6 +5,7 @@
 \usepackage{subcaption}
 \usepackage{parskip} % dont indent after paragraphs, figures
 \usepackage{xcolor}
+%\usepackage{csquotes} % Recommended for biblatex
 \usepackage{tikz}
 \usepackage{float}
 \usepackage{amsmath}
@@ -58,9 +59,9 @@ As a measure, Shannon's formula uses the \textit{Bit}, quantifying the efficienc
 and media for transmission and storage.
 According to his axioms, a measure for information has to comply with the following criteria:
 \begin{enumerate}
+\item $I(1) = 0$: events that always occur do not communicate information.
 \item $I(p)$ is monotonically decreasing in p: an increase in the probability of an event
 decreases the information from an observed event, and vice versa.
-\item $I(1) = 0$: events that always occur do not communicate information.
 \item $I(p_1 \cdot p_2) = I(p_1) + I(p_2)$: the information learned from independent events
 is the sum of the information learned from each event.
 \item $I(p)$ is a twice continuously differentiable function of p.
@@ -69,7 +70,7 @@ In information theory, entropy can be understood as the expected information of
 \begin{equation}
 H = E(I) = - \sum_i p_i \log_2(p_i)
 \end{equation}
-This leaves $ I =log(1/p) = - log_2(p_i)$, implying that an unexpected message (low probability) carries
+This leaves $I = \log_2(1/p_i) = -\log_2(p_i)$, implying that an unexpected message (low probability) carries
 more information than one with higher probability.
 Intuitively, we can imagine David A. Johnston, a volcanologist reporting day after day that there is no
 activity on Mount St. Helens. After a while, we grow to expect this message because it is statistically very likely
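
To make the definitions in this hunk concrete, here is a minimal Python sketch (an illustration, not part of the committed sources) that computes the information content $I(p) = -\log_2(p)$ and the entropy $H = -\sum_i p_i \log_2(p_i)$, and checks two of the axioms listed above; the probabilities are made-up example values.

import math

def information(p: float) -> float:
    """Information content in bits of an event with probability p."""
    return -math.log2(p)

def entropy(probs) -> float:
    """Shannon entropy in bits: the expected information of a distribution."""
    return -sum(p * math.log2(p) for p in probs if p > 0)

# Axiom checks: I(1) = 0, and I(p1 * p2) = I(p1) + I(p2) for independent events.
assert information(1.0) == 0.0
p1, p2 = 0.5, 0.25
assert math.isclose(information(p1 * p2), information(p1) + information(p2))

# A rare report ("eruption", p = 0.01) carries far more information than the
# expected daily "no activity" report (p = 0.99).
print(information(0.01))      # ~6.64 bits
print(information(0.99))      # ~0.014 bits
print(entropy([0.01, 0.99]))  # ~0.081 bits of expected information
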
@@ -92,7 +93,7 @@ leaving the required Bits per event source at $\log_2(a^k) \div k = \log_2(a)$.
 %- Quellentropie
 \section{Applications}
 \subsection{Decision Trees}
-A decision tree is a supervised learning approach commonly used in data mining.
+A decision tree is a supervised learning approach commonly used in machine learning.
 The goal is to create an algorithm, i.e a series of questions to pose to new data (input variables)
 in order to predict the target variable, a class label.
 Graphically, each question can be visualized as a node in a tree, splitting the dataset into two or more groups.
@@ -100,15 +101,11 @@ This process is applied to the source set and then its resulting sets in a proce
 Once a leaf is reached, the class of the input has been successfully determined.
 
 In order to build the shallowest possible trees, we want to use input variables that minimize uncertainty.
-While other measures for the best choice such as the Gini coefficient exist,
+While other measures for the best choice such as the \textit{Gini coefficient} exist,
 entropy is a popular measure used in decision trees.
 
-Using the previous learnings about information and entropy, we of course want to ask questions that have the
-highest information content, so pertaining input variables with the highest entropy.
-
-Zu berechnen ist also die Entropie des Klassifikators abzüglich der erwarteten Entropie
-nach der Aufteilung anhand des Attributs,
-erklärt in~\autoref{ex:decisiontree}.
+Using what we learned about entropy, we want the split that yields the maximum decrease in the entropy of our target variable,
+as explained in~\autoref{ex:decisiontree}.
 \begin{figure}[H]
 \centering
 \begin{minipage}{.3\textwidth}
@@ -121,25 +118,23 @@ erklärt in~\autoref{ex:decisiontree}.
 \end{tabular}
 \end{minipage}
 \begin{minipage}{.6\textwidth}
-Die Entropie für Regen ist also $H_{prior} = H(\frac{9}{14},\frac{5}{14})$,
-nach der Aufteilung anhand des Attributs Temperatur beträgt sie $H_{warm}= H(\frac{4}{7}, \frac{3}{7})$
-und $H_{kalt}= H(\frac{5}{7}, \frac{2}{7})$.
-Also ist die erwartete (Erwartungswert) Entropie nach der Aufteilung anhand der Temperatur $p_{warm} * H_{warm} + p_{kalt} * H_{kalt} $ .
-Der \textbf{Informationsgewinn} berechnet sich dann als Entropie für Regen abzüglich des Erwartungswertes der Entropie.
-Da $H_{prior}$ in dieser Berechnung jedoch konstant ist, kann auch einfach $E[H]$ nach der Aufteilung minimiert werden.
+When choosing rain as the target variable, the entropy prior to partitioning is $H_{prior} = H(\frac{9}{14},\frac{5}{14})$;
+after partitioning by temperature (hot/cold), $H_{hot} = H(\frac{4}{7}, \frac{3}{7})$
+and $H_{cold} = H(\frac{5}{7}, \frac{2}{7})$ remain.
+This leaves us with an expected entropy of
+$p_{hot} * H_{hot} + p_{cold} * H_{cold}$.
+The \textbf{information gain} can then be calculated as the difference between the entropy prior to and after partitioning.
+Since $H_{prior}$ is constant in this calculation, it is sufficient to minimize the post-partitioning $E[H]$.
 \end{minipage}
-\caption{Beispiel Informationsgewinn für Aufbau eines Entscheidungsbaums}
+\caption{Example of information gain in decision trees}
 \label{ex:decisiontree}
 \end{figure}
 
-Vorteil: Niedriger Rechenaufwand und nachvollziehbarer Aufbau
-Nachteil: Overfitting, geringe Robustheit: bereits kleine Änderungen der Trainingsdaten können zu einer Veränderung des Baums führen.
-
-Kann zu einem \textbf{Random Forest} (bagging) erweitert werden um die Robustheit zu steigern und Overfitting zu verringern.
-Resultiert jedoch in höherem Rechenaufwand und geringerer Interpretierbarkeit.
-Die Trainingsdaten können dafür zufällig gruppiert werden und die Bäume für eine Mehrheitsentscheidung genutzt werden.
+Advantages of decision trees over other machine learning approaches include low computation cost and
+interpretability, making them a popular choice for many applications.
+However, drawbacks include overfitting and poor robustness, where minimal alterations to the training data
+can lead to a change in tree structure.
 \subsection{Cross-Entropy}
 Kullback-Leibler = $H(p,q) - H(p)$
 as a cost function in machine learning
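
The worked example in the new figure text can be checked with a short Python sketch (an illustration, not part of the committed sources); the counts 9/14, 5/14, 4/7, 3/7, 5/7 and 2/7 are the ones quoted in the hunk above.

import math

def entropy(probs) -> float:
    """Shannon entropy in bits of a discrete distribution."""
    return -sum(p * math.log2(p) for p in probs if p > 0)

# Target variable "rain": 9 rainy vs. 5 dry days out of 14.
h_prior = entropy([9/14, 5/14])                 # ~0.940 bits

# Branch entropies after splitting on temperature (hot/cold), 7 days each.
h_hot  = entropy([4/7, 3/7])                    # ~0.985 bits
h_cold = entropy([5/7, 2/7])                    # ~0.863 bits

# Expected entropy after the split, weighted by the branch probabilities.
p_hot, p_cold = 7/14, 7/14
h_expected = p_hot * h_hot + p_cold * h_cold    # ~0.924 bits

information_gain = h_prior - h_expected         # ~0.016 bits
print(h_prior, h_expected, information_gain)

The Cross-Entropy subsection is still a stub, so purely as a hedged illustration (again not part of the commit), here is the relation named in its note, $D_{KL}(p \| q) = H(p,q) - H(p)$, with cross-entropy used as a cost function; the distributions are made-up example values.

import math

def cross_entropy(p, q) -> float:
    """H(p, q) = -sum_i p_i * log2(q_i), in bits."""
    return -sum(pi * math.log2(qi) for pi, qi in zip(p, q) if pi > 0)

def kl_divergence(p, q) -> float:
    """Kullback-Leibler divergence D(p || q) = H(p, q) - H(p)."""
    return cross_entropy(p, q) - cross_entropy(p, p)

# One-hot true label vs. a model's predicted class probabilities, as in a
# typical classification cost function.
p = [1.0, 0.0, 0.0]
q = [0.7, 0.2, 0.1]
print(cross_entropy(p, q))   # ~0.515 bits, the loss for this prediction
print(kl_divergence(p, q))   # identical here, since H(p) = 0 for a one-hot p
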
@@ -151,8 +146,24 @@ as a cost function in machine learning
 \subsection{Noisy communication channels}
 The noisy channel coding theorem was stated by \textit{Claude Shannon} in 1948, but first rigorous proof was
 provided in 1954 by Amiel Feinstein.
-It is of foundational to information theory, stating that given a noisy channel with capacity $C$
-and information transmitted at $R$ \cite{enwiki:shannon-hartley}
+One of the important issues Shannon wanted to tackle with his 'Mathematical Theory of Communication'
+was the lack of means for transporting discrete data through a noisy channel more efficiently than
+the telegraph.
+Error correction until then had been limited to very basic techniques.
+
+First, analogue connections such as the early telephone lines bypassed the issue altogether and relied
+on the communicating parties and their brains' ability to filter human voices from the noise that was inevitably transmitted
+along with the intended signal.
+After some development, the telegraph in its final form used Morse code, a series of long and short clicks that,
+together with letter and word gaps, would encode text messages.
+Even though the long-short coding might appear similar to today's binary coding, the means of error correction were lacking.
+For a long time, it relied on simply repeating the message multiple times, which is highly inefficient.
+The destination would then have to determine the most likely intended message by performing a majority vote.
+One might also propose simply increasing the transmitting power, thereby decreasing the error rate of the associated channel.
+However, the noisy channel coding theorem provides us with a more elegant solution.
+It is of foundational importance to information theory, stating that given a noisy channel with capacity $C$
+and information transmitted at a rate $R < C$, there exist codes for which the error rate at the receiver can be made
+arbitrarily small.
 
 \begin{figure}[H]
 \begin{tikzpicture}
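
The repetition-and-majority-vote scheme described in the added text can be simulated in a few lines of Python (an illustration, not part of the committed sources; the crossover probability p = 0.1 is an arbitrary example value). It shows why the scheme is inefficient: the error rate falls with the number of repetitions n, but the code rate falls to 1/n.

import random

def send_through_bsc(bit: int, p: float) -> int:
    """Binary symmetric channel: flip the bit with crossover probability p."""
    return bit ^ 1 if random.random() < p else bit

def repetition_decode(bit: int, n: int, p: float) -> int:
    """Send the bit n times and take a majority vote at the destination."""
    received = [send_through_bsc(bit, p) for _ in range(n)]
    return 1 if sum(received) > n / 2 else 0

def error_rate(n: int, p: float, trials: int = 100_000) -> float:
    errors = sum(repetition_decode(1, n, p) != 1 for _ in range(trials))
    return errors / trials

p = 0.1  # assumed crossover probability, for illustration
for n in (1, 3, 5, 7):
    # The error probability drops with n, but the code rate is only 1/n.
    print(f"n={n}: rate={1/n:.2f}, error~{error_rate(n, p):.4f}")
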
@@ -183,20 +194,20 @@ and information transmitted at $R$ \cite{enwiki:shannon-hartley}
 \def\boxw{2.5cm}
 \def\n{4}
 \pgfmathsetmacro{\gap}{(\textwidth - \n*\boxw)/(\n-1)}
-\node (S) at (0,0) [draw, align=center, text width=\boxw] {Information Source};
+\node (S) at (0,0) [draw, align=center, text width=\boxw] {Transmitter};
 \node (S0) at (\boxw + \gap,1) [draw, circle] {0};
 \node (S1) at (\boxw + \gap,-1) [draw, circle] {1};
 \node (D0) at ({2*(\boxw + \gap)},1) [draw, circle] {0};
 \node (D1) at ({2*(\boxw + \gap)},-1) [draw, circle] {1};
-\node (D) at ({3*(\boxw + \gap)},0) [draw, align=center, text width=\boxw] {Destination};
+\node (D) at ({3*(\boxw + \gap)},0) [draw, align=center, text width=\boxw] {Receiver};
 
 \draw[->] (S) -- (S0);
 \draw[->] (S) -- (S1);
 
-\draw[->,dashed] (S0) -- (D0) node[midway, above] {$p$};
-\draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$1-p$};
-\draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$1-p$};
-\draw[->,dashed] (S1) -- (D1) node[midway, below] {$p$};
+\draw[->,dashed] (S0) -- (D0) node[midway, above] {$1-p$};
+\draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$p$};
+\draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$p$};
+\draw[->,dashed] (S1) -- (D1) node[midway, below] {$1-p$};
 
 \draw[->] (D0) -- (D);
 \draw[->] (D1) -- (D);
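
For reference alongside the corrected figure (an illustration, not part of the committed sources): the binary symmetric channel drawn above, with crossover probability p, has the standard capacity C = 1 - H_b(p), where H_b is the binary entropy function, so reliable transmission in the sense of the theorem requires R < C.

import math

def binary_entropy(p: float) -> float:
    """H_b(p) = -p*log2(p) - (1-p)*log2(1-p), in bits."""
    if p in (0.0, 1.0):
        return 0.0
    return -p * math.log2(p) - (1 - p) * math.log2(1 - p)

def bsc_capacity(p: float) -> float:
    """Capacity of a binary symmetric channel with crossover probability p."""
    return 1.0 - binary_entropy(p)

# A noiseless channel (p = 0) carries 1 bit per use; at p = 0.5 the output is
# independent of the input and the capacity drops to 0.
for p in (0.0, 0.1, 0.5):
    c = bsc_capacity(p)
    print(f"p={p}: C={c:.3f} bits/use (reliable transmission needs R < {c:.3f})")
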
@@ -205,4 +216,6 @@ and information transmitted at $R$ \cite{enwiki:shannon-hartley}
 \label{fig:binary-channel}
 \end{figure}
 
+\printbibliography
+
 \end{document}