@inproceedings{c8310fc0d0bd4036afadb6d5bb0b161e,
title = "ViTAA: Visual-Textual Attributes Alignment in Person Search by Natural Language",
abstract = "Person search by natural language aims at retrieving a specific person in a large-scale image pool that matches given textual descriptions. While most of the current methods treat the task as a holistic visual and textual feature matching one, we approach it from an attribute-aligning perspective that allows grounding specific attribute phrases to the corresponding visual regions. We achieve success as well as a performance boost by a robust feature learning that the referred identity can be accurately bundled by multiple attribute cues. To be concrete, our Visual-Textual Attribute Alignment model (dubbed as ViTAA) learns to disentangle the feature space of a person into sub-spaces corresponding to attributes using a light auxiliary attribute segmentation layer. It then aligns these visual features with the textual attributes parsed from the sentences via a novel contrastive learning loss. We validate our ViTAA framework through extensive experiments on tasks of person search by natural language and by attribute-phrase queries, on which our system achieves state-of-the-art performances. Codes and models are available at https://github.com/Jarr0d/ViTAA.",
keywords = "Metric learning, Person re-identification, Person search by natural language, Vision and language",
author = "Zhe Wang and Zhiyuan Fang and Jun Wang and Yezhou Yang",
note = "Funding Information: Acknowledgements. Vising scholarship support for Z. Wang from the China Scholarship Council #201806020020 and Amazon AWS Machine Learning Research Award (MLRA) support are greatly appreciated. Any opinions, findings, and conclusion or recommendations expressed in this material are those of the authors and do not necessarily reflect the view of the sponsors. Publisher Copyright: {\textcopyright} 2020, Springer Nature Switzerland AG.; 16th European Conference on Computer Vision, ECCV 2020 ; Conference date: 23-08-2020 Through 28-08-2020",
year = "2020",
doi = "10.1007/978-3-030-58610-2_24",
language = "English (US)",
isbn = "9783030586096",
series = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
publisher = "Springer Science and Business Media Deutschland GmbH",
pages = "402--420",
editor = "Andrea Vedaldi and Horst Bischof and Thomas Brox and Jan-Michael Frahm",
booktitle = "Computer Vision – ECCV 2020 - 16th European Conference, Proceedings",
address = "Germany",
}