@article{LongdeMeloHeetal.2020, author = {Long, Xiang and de Melo, Gerard and He, Dongliang and Li, Fu and Chi, Zhizhen and Wen, Shilei and Gan, Chuang}, title = {Purely attention based local feature integration for video classification}, series = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, volume = {44}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {4}, publisher = {Inst. of Electr. and Electronics Engineers}, address = {Los Alamitos}, issn = {0162-8828}, doi = {10.1109/TPAMI.2020.3029554}, pages = {2140 -- 2154}, year = {2020}, abstract = {Recently, substantial research effort has focused on how to apply CNNs or RNNs to better capture temporal patterns in videos, so as to improve the accuracy of video classification. In this paper, we investigate the potential of a purely attention based local feature integration. Accounting for the characteristics of such features in video classification, we first propose Basic Attention Clusters (BAC), which concatenates the output of multiple attention units applied in parallel, and introduce a shifting operation to capture more diverse signals. Experiments show that BAC can achieve excellent results on multiple datasets. However, BAC treats all feature channels as an indivisible whole, which is suboptimal for achieving a finer-grained local feature integration over the channel dimension. Additionally, it treats the entire local feature sequence as an unordered set, thus ignoring the sequential relationships. To improve over BAC, we further propose the channel pyramid attention schema by splitting features into sub-features at multiple scales for coarse-to-fine sub-feature interaction modeling, and propose the temporal pyramid attention schema by dividing the feature sequences into ordered sub-sequences of multiple lengths to account for the sequential order. Our final model pyramidxpyramid attention clusters (PPAC) combines both channel pyramid attention and temporal pyramid attention to focus on the most important sub-features, while also preserving the temporal information of the video. We demonstrate the effectiveness of PPAC on seven real-world video classification datasets. Our model achieves competitive results across all of these, showing that our proposed framework can consistently outperform the existing local feature integration methods across a range of different scenarios.}, language = {en} } @article{KreowskyStabernack2021, author = {Kreowsky, Philipp and Stabernack, Christian Benno}, title = {A full-featured FPGA-based pipelined architecture for SIFT extraction}, series = {IEEE access : practical research, open solutions / Institute of Electrical and Electronics Engineers}, volume = {9}, journal = {IEEE access : practical research, open solutions / Institute of Electrical and Electronics Engineers}, publisher = {Inst. of Electr. and Electronics Engineers}, address = {New York, NY}, issn = {2169-3536}, doi = {10.1109/ACCESS.2021.3104387}, pages = {128564 -- 128573}, year = {2021}, abstract = {Image feature detection is a key task in computer vision. Scale Invariant Feature Transform (SIFT) is a prevalent and well known algorithm for robust feature detection. However, it is computationally demanding and software implementations are not applicable for real-time performance. In this paper, a versatile and pipelined hardware implementation is proposed, that is capable of computing keypoints and rotation invariant descriptors on-chip. All computations are performed in single precision floating-point format which makes it possible to implement the original algorithm with little alteration. Various rotation resolutions and filter kernel sizes are supported for images of any resolution up to ultra-high definition. For full high definition images, 84 fps can be processed. Ultra high definition images can be processed at 21 fps.}, language = {en} }