Commit d14ece55 authored by uoega

paper ver 2.4

parent 90377209
......@@ -59,7 +59,7 @@
@misc{ma2019nlpaug,
title={NLP Augmentation},
author={Edward Ma},
howpublished={https://github.com/makcedward/nlpaug},
year={2019}
}
......
......@@ -20,8 +20,8 @@
\citation{kopuk2019realtime}
\citation{estevam2020zeroshot}
\citation{perez2017effectiveness}
\@writefile{toc}{\contentsline {section}{\numberline {1}\hskip -1em.\nobreakspace {}Introduction}{1}{section.1}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {2}\hskip -1em.\nobreakspace {}Method}{1}{section.2}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {1}\hskip -1em.\nobreakspace {}Introduction}{1}{section.1}}
\@writefile{toc}{\contentsline {section}{\numberline {2}\hskip -1em.\nobreakspace {}Method}{1}{section.2}}
\newlabel{method}{{2}{1}{\hskip -1em.~Method}{section.2}{}}
\citation{jasani2019skeleton}
\citation{jasani2019skeleton}
......@@ -36,29 +36,29 @@
\citation{sung2018learning}
\citation{sung2018learning}
\citation{sung2018learning}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Overview of the network modules.}}{2}{figure.1}\protected@file@percent }
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Overview of the network modules.}}{2}{figure.1}}
\newlabel{architecture}{{1}{2}{Overview of the network modules}{figure.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}\hskip -1em.\nobreakspace {}Architecture}{2}{subsection.2.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}\hskip -1em.\nobreakspace {}Augmentation}{2}{subsection.2.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.1}Descriptive labels}{2}{subsubsection.2.2.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}\hskip -1em.\nobreakspace {}Architecture}{2}{subsection.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}\hskip -1em.\nobreakspace {}Augmentation}{2}{subsection.2.2}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.1}Descriptive labels}{2}{subsubsection.2.2.1}}
\citation{ma2019nlpaug}
\citation{liu2019roberta}
\citation{Liu_2020}
\citation{reimers2019sentencebert}
\citation{gupta2021syntactically}
\citation{sung2018learning}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Three descriptive labels for the class "Squat down".}}{3}{table.1}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Three descriptive labels for the class "Squat down".}}{3}{table.1}}
\newlabel{tab:multi_label}{{1}{3}{Three descriptive labels for the class "Squat down"}{table.1}{}}
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Descriptive label and two automatic augmentations for "Squat down".}}{3}{table.2}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Descriptive label and two automatic augmentations for "Squat down".}}{3}{table.2}}
\newlabel{tab:auto_aug}{{2}{3}{Descriptive label and two automatic augmentations for "Squat down"}{table.2}{}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.2}Multiple labels per class}{3}{subsubsection.2.2.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.3}Automatic augmentation}{3}{subsubsection.2.2.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}\hskip -1em.\nobreakspace {}Experiments}{3}{subsection.2.3}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces ZSL and GZSL results for different approaches.}}{3}{table.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.2}Multiple labels per class}{3}{subsubsection.2.2.2}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.2.3}Automatic augmentation}{3}{subsubsection.2.2.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}\hskip -1em.\nobreakspace {}Experiments}{3}{subsection.2.3}}
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces ZSL and GZSL results for different approaches.}}{3}{table.3}}
\newlabel{tab:ZSL_GZSL}{{3}{3}{ZSL and GZSL results for different approaches}{table.3}{}}
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Unseen top-1 and top-5 accuracies in detail.}}{3}{table.4}\protected@file@percent }
\newlabel{tab:top1_top5}{{4}{3}{Unseen top-1 and top-5 accuracies in detail}{table.4}{}}
\@writefile{toc}{\contentsline {section}{\numberline {3}\hskip -1em.\nobreakspace {}Results}{3}{section.3}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Unseen top-1 and top-5 accuracies (GZSL).}}{3}{table.4}}
\newlabel{tab:top1_top5}{{4}{3}{Unseen top-1 and top-5 accuracies (GZSL)}{table.4}{}}
\@writefile{toc}{\contentsline {section}{\numberline {3}\hskip -1em.\nobreakspace {}Results}{3}{section.3}}
\citation{jasani2019skeleton}
\citation{ma2019nlpaug}
\bibstyle{ieee_fullname}
......@@ -66,11 +66,11 @@
\bibcite{cao2019openpose}{1}
\bibcite{estevam2020zeroshot}{2}
\bibcite{gupta2021syntactically}{3}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}\hskip -1em.\nobreakspace {}Discussion}{4}{subsection.3.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}From default to descriptive labels}{4}{subsubsection.3.1.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.2}Using multiple labels}{4}{subsubsection.3.1.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.3}Automatic augmentation}{4}{subsubsection.3.1.3}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}\hskip -1em.\nobreakspace {}Conclusion}{4}{section.4}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}\hskip -1em.\nobreakspace {}Discussion}{4}{subsection.3.1}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}From default to descriptive labels}{4}{subsubsection.3.1.1}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.2}Using multiple labels}{4}{subsubsection.3.1.2}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.3}Automatic augmentation}{4}{subsubsection.3.1.3}}
\@writefile{toc}{\contentsline {section}{\numberline {4}\hskip -1em.\nobreakspace {}Conclusion}{4}{section.4}}
\bibcite{jasani2019skeleton}{4}
\bibcite{kopuk2019realtime}{5}
\bibcite{Liu_2020}{6}
......@@ -82,4 +82,3 @@
\bibcite{reimers2019sentencebert}{12}
\bibcite{sung2018learning}{13}
\bibcite{yan2018spatial}{14}
\gdef \@abspage@last{5}
......@@ -46,7 +46,7 @@ Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
\bibitem{ma2019nlpaug}
Edward Ma.
\newblock Nlp augmentation.
\newblock https://github.com/makcedward/nlpaug, 2019.
\bibitem{marinov2021pose2drone}
Zdravko Marinov, Stanka Vasileva, Qing Wang, Constantin Seibold, Jiaming Zhang,
......
This diff is collapsed.
......@@ -168,11 +168,11 @@ To reduce the manual annotation effort, we would like to generate additional lab
In this work we use the NTU RGB+D 120 dataset \cite{Liu_2020}, which contains 3D skeleton data for 114,480 samples of 120 different human action classes. To evaluate our model we pick a subset of 40 gesture classes and execute four performance tests: one with our default labels as a baseline, and one per augmentation method. A performance test consists of eight training runs on 35/5 (seen/unseen) splits, which are randomized in such a way that every single class is unseen in exactly one training run.
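For illustration, the following sketch (one possible implementation; function and variable names are placeholders, not our actual code) generates such splits so that every class is unseen exactly once:
\begin{verbatim}
import random

# Sketch: build eight 35/5 (seen/unseen) splits of the 40 gesture
# classes so that every class is unseen in exactly one run.
def make_splits(class_ids, n_unseen=5, seed=0):
    classes = list(class_ids)
    random.Random(seed).shuffle(classes)
    splits = []
    for i in range(0, len(classes), n_unseen):
        unseen = classes[i:i + n_unseen]
        seen = [c for c in classes if c not in unseen]
        splits.append((seen, unseen))
    return splits

splits = make_splits(range(40))  # eight (seen, unseen) pairs
assert all(len(u) == 5 for _, u in splits)
\end{verbatim}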
During training, only the weights of the AN and RN modules are adjusted. The GCN is trained beforehand on the 80 unused classes of the NTU dataset to ensure that the unseen gestures have not already appeared in the training process at some early point before inference and the SBERT module has been trained in \cite{reimers2019sentencebert}.
After testing, the accuracies are averaged over the eight individual experiments. For each augmentation method we test the performance in two scenarios: In the ZSL scenario, the model only predicts on the unseen classes, while it predicts on all classes (seen and unseen) in the GZSL scenario. In the latter we measure the accuracy for seen and unseen samples, as well as the harmonic mean, following recent works \cite{gupta2021syntactically}. For default and descriptive labels, we train our Network with a batch size of 32 and without batch norm, as was done in the original paper \cite{sung2018learning}. For the multi labels however, we used a batch size of 128 and batch norm at the input of the RN.
During a training run, only the weights of the AN and RN modules are adjusted. The GCN is trained beforehand on the 80 unused classes of the NTU dataset to ensure that the unseen gestures have not appeared in the training process at some early stage. The SBERT module has already been trained on large text corpora by \cite{reimers2019sentencebert}.
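A minimal sketch of this training setup (the stand-in modules and their dimensions are assumptions, not our actual architecture):
\begin{verbatim}
import torch.nn as nn
from torch.optim import Adam

# Stand-in modules; the real GCN, AN and RN are larger networks.
gcn = nn.Linear(75, 256)    # pretrained skeleton feature extractor
an  = nn.Linear(1024, 256)  # attribute network on SBERT embeddings
rn  = nn.Linear(512, 1)     # relation network on concatenated features

# Freeze the pretrained GCN; only AN and RN receive gradients.
for p in gcn.parameters():
    p.requires_grad = False

optimizer = Adam([p for m in (an, rn) for p in m.parameters()], lr=1e-4)
\end{verbatim}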
For testing, the accuracies are averaged over the eight individual experiments of a performance test. We test the performance in two scenarios for each augmentation method: in the ZSL scenario, the model predicts only on the unseen classes, while in the GZSL scenario it predicts on all classes (seen and unseen). In the latter we measure the accuracy for seen and unseen samples, as well as their harmonic mean, following recent works \cite{gupta2021syntactically}. For default and descriptive labels, we train our network with a batch size of 32 and without batch norm, as was done in the original paper \cite{sung2018learning}. When using multiple labels, we instead use a batch size of 128 and batch norm at the input of the RN.
This is mainly done for performance reasons: without batch norm, the multi-label approach with more than three labels did not learn at all. %batchnorm in general -> decrease in unseen
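To illustrate, a hedged sketch of the relation network head with the optional batch norm at its input (layer sizes are assumptions, not taken from our implementation):
\begin{verbatim}
import torch.nn as nn

class RelationNetwork(nn.Module):
    # Batch norm at the input is only enabled for the multi-label runs.
    def __init__(self, in_dim=512, hidden_dim=400, use_batchnorm=False):
        super().__init__()
        layers = [nn.BatchNorm1d(in_dim)] if use_batchnorm else []
        layers += [nn.Linear(in_dim, hidden_dim), nn.ReLU(),
                   nn.Linear(hidden_dim, 1), nn.Sigmoid()]
        self.net = nn.Sequential(*layers)

    def forward(self, pair):
        # pair: concatenated visual and semantic features of a candidate
        return self.net(pair)
\end{verbatim}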
\section{Results}
......@@ -180,7 +180,7 @@ This was mainly done due to performance reasons because the multi label approach
\begin{center}
\begin{tabular}{|l|c|c|c|c|}
\hline
Approach & ZSL & Seen & Unseen & h\\
Augmentation & ZSL & Seen & Unseen & h\\
\hline\hline
Baseline & 0.4739 & 0.8116 & 0.1067 & 0.1877\\
Descriptive & 0.5186 & 0.8104 & 0.1503 & 0.2495\\
......@@ -197,7 +197,7 @@ This was mainly done due to performance reasons because the multi label approach
\begin{center}
\begin{tabular}{|l|c|c|}
\hline
Approach & top-1${\pm}$ std & top-5 ${\pm}$ std \\
Augmentation & top-1 ${\pm}$ std & top-5 ${\pm}$ std \\
\hline\hline
Baseline & ${0.1067\pm 0.0246}$ & ${0.5428\pm 0.0840}$ \\
Descriptive & ${0.1503\pm 0.0553}$ & ${0.6460\pm 0.1250}$ \\
......@@ -206,21 +206,31 @@ This was mainly done due to performance reasons because the multi label approach
\hline
\end{tabular}
\end{center}
\caption{Unseen top-1 and top-5 accuracies in detail.}
\caption{Unseen top-1 and top-5 accuracies (GZSL).}
\label{tab:top1_top5}
\end{table}
All our results were generated following the procedure described in the Experiments section. In table \ref{tab:ZSL_GZSL}, one can see the ZSL, seen and unseen accuracies, as well as the harmonic mean. Table \ref{tab:top1_top5} shows a more detailed view of the achieved unseen accuracies. It shows the top-1 and top-5 accuracies for our approaches with their standard deviation over the 8 splits. Improvements on the ZSL accuracy, the unseen accuracy and the harmonic mean were achieved using the descriptive labels and even more so with the three descriptive labels approach. Using only one manually created descriptive label and four automatic augmentations of this description in a multi label approach performs worse compared to three descriptive labels, but still constitutes a relative 23\% increase over using only one descriptive label. The seen accuracy is quite similar for all approaches; still, it is slightly higher for the multi labels approaches, which occurred due to the use of batch norm, which always raised the seen accuracy slightly at the cost of lowering the unseen accuracy. \\
In the more detailed table one can see, that the top-5 accuracies increases similarly to their top-1 counterparts, with the exception of a less severe performance decrease when using automatic augmentations. This behavior was often observed for experiments with the multi label approach. As for the standard deviations, one can see that for the top-1 accuracies all approaches based on the descriptive labels are in the same range. For the top-5 accuracies we even get a decrease in standard deviation with higher accuracy values which indicates a higher consistency for the multi label approach.
All our results are generated following the procedure described in the Experiments section. Table \ref{tab:ZSL_GZSL} lists the ZSL, seen and unseen accuracies, as well as the harmonic mean. Table \ref{tab:top1_top5} displays a more detailed view of the achieved unseen accuracies: the top-1 and top-5 accuracies of our approaches with their standard deviations (std) over the eight splits.\\
Improvements on the ZSL accuracy, the unseen accuracy and the harmonic mean were achieved using the descriptive labels, and even more so with the three descriptive labels approach. Using one manually created descriptive label plus four automatic augmentations of it in a multiple label approach performs worse than three descriptive labels, but still constitutes a relative 23\% increase over a single descriptive label. The seen accuracy only increases marginally for the two cases that use multiple labels. We observe this behavior whenever batch normalization is applied to any of our approaches, along with a decrease in unseen accuracy. It is therefore only applied where multiple labels are used, because training does not converge without it in those cases. \\
Table \ref{tab:top1_top5} shows that the top-5 accuracies behave similarly to their top-1 counterparts, except for a less severe decrease when using automatic augmentations. The standard deviations of the top-1 accuracies are in the same range for all approaches based on the descriptive labels. For the top-5 accuracies, the standard deviation decreases for the multiple label approaches, which indicates a higher prediction consistency.
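For reference, the harmonic mean $h$ in table \ref{tab:ZSL_GZSL} follows the standard GZSL definition; a small sketch (recomputing $h$ from the averaged table values gives slightly different numbers, since the exact averaging order over the eight runs matters):
\begin{verbatim}
def harmonic_mean(seen_acc, unseen_acc):
    # GZSL harmonic mean of seen and unseen accuracies
    return 2 * seen_acc * unseen_acc / (seen_acc + unseen_acc)

# Baseline row of table 3; the result is close to the reported 0.1877
# (small deviations stem from averaging over the eight runs first).
print(round(harmonic_mean(0.8116, 0.1067), 4))  # 0.1886
\end{verbatim}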
\subsection{Discussion}
\subsubsection{From default to descriptive labels}
The improvement from the use of descriptive labels over the use of the default labels shows that incorporating more visual information into the semantic embedding by using visual descriptions as class labels helps the network to find a general relation between the semantic and the visual space learned only on the seen training data. Plainly speaking the network can find more similarities between the describing sentences compared to just one-word labels. One would expect this to already be possible on the default labels due to the use of text embeddings, but the issue there lies with the way that the embedding modules are trained. The embeddings of the class labels 'sit down' and 'drink water' might be somewhat similar, because those words appear together frequently in the large training text corpora, but visually those classes look vastly different from each other. The embeddings falsely suggest, that a similarity between the classes is there, which is less likely to happen if the embeddings are created from visual descriptions of the actions.
The improvement from the use of descriptive labels shows that incorporating more visual information into the semantic embeddings helps the network to find a general relation between the semantic and the visual space based only on the seen training data. Plainly speaking, the network can find more similarities between the class labels. This matters because the assumed visual features of an unseen class are inferred from the similarities between its label and the seen labels.
One might expect these similarities to also be present in the embeddings of the default labels, because SBERT should be able to generate representative embeddings that share characteristics across similar classes.
While such similarities are present in the SBERT embeddings, they are not focused on the visual appearance of the gestures.
For example, the embeddings of the class labels 'sit down' and 'drink water' might be somewhat similar, because those words appear together frequently in the large text corpora that SBERT was trained on, but visually those classes look vastly different from each other. The embeddings thus falsely suggest a similarity between the classes, which is less likely to happen if the embeddings are created from visual descriptions of the actions.
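This intuition can be checked directly; a small sketch using the sentence-transformers library (the concrete model name is an assumption, not necessarily the one used in our experiments):
\begin{verbatim}
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed model
emb = model.encode(['sit down',
                    'drink water',
                    'A person bends the knees and lowers the body.'])
# Similarity of the two default labels vs. label and visual description
print(util.cos_sim(emb[0], emb[1]), util.cos_sim(emb[0], emb[2]))
\end{verbatim}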
\subsubsection{Using multiple labels}
%TODO: maybe briefly discuss the significance of the batch size increase (32 -> 128), needed because more classes appear per batch
For the "multiple labels" approach the idea is somewhat different. The main motivation here was that using more data is generally a good idea. In our case the network is forced to learn a more general mapping between the semantic and the visual feature space since the descriptions and therefore also the embeddings change randomly during training. It has to adapt to the greater diversity of the used label semantic embeddings. This improved generalization on seen training data then helps to better understand and also classify the unseen samples.
\subsubsection{Automatic augmentation}
......@@ -228,12 +238,12 @@ As described in chapter \ref{method}, using automatic augmentation methods intro
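As an example of such an automatic augmentation with nlpaug \cite{ma2019nlpaug} (the augmenter type and its parameters are assumptions; depending on the nlpaug version, augment returns a string or a list):
\begin{verbatim}
import nlpaug.augmenter.word as naw

# Contextual word substitution using RoBERTa
aug = naw.ContextualWordEmbsAug(model_path='roberta-base',
                                action='substitute')
print(aug.augment('A person squats down by bending the knees.'))
\end{verbatim}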
\section{Conclusion}
In this work, we highlighted the importance of the semantic embeddings in the context of skeleton based zero-shot gesture recognition by showing how the performance can increase based only on the augmentation of those embeddings. By including more visual information in the class labels and combining multiple descriptions per class we could improve the model based on \cite{jasani2019skeleton} by a significant margin. The use of automatic text augmentation methods like \cite{ma2019nlpaug} already reduces the effort of manual annotation significantly, while maintaining most of the performance gain.
In this work, we highlight the importance of the semantic embeddings in the context of skeleton-based zero-shot gesture recognition by showing how the performance can increase based solely on the augmentation of those embeddings. By including more visual information in the class labels and combining multiple descriptions per class, we improve the model based on \cite{jasani2019skeleton} by a significant margin. The use of automatic text augmentation methods like \cite{ma2019nlpaug} already reduces the manual annotation effort significantly, while maintaining most of the performance gain.
Future works could further investigate the following topics: First, generating descriptive sentences from the default labels using methods from Natural Language Processing (NLP) could be implemented to further reduce the manual annotation effort. Second, additional tests on different zero-shot architectures to verify the improvements shown in our work could be performed. Finally, different kinds or combinations of automatic text augmentation methods could be evaluated.
Future work could investigate the following topics: First, descriptive sentences could be generated from the default labels using methods from Natural Language Processing (NLP) to further reduce the manual annotation effort. Second, the improvements shown in our work could be verified on different zero-shot architectures. Finally, different kinds or combinations of automatic text augmentation methods could be evaluated.
With these advances, data augmentation of the semantic embedding in Zero-Shot Learning could prove useful in optimizing the performance of any Zero-Shot approach in the future.
With these advances, data augmentation of the semantic embeddings in Zero-Shot Learning can prove useful in optimizing the performance of any Zero-Shot approach in the future.
......