% Bibliography entries dated 2024, laid out one field per line.
%
% Several entries carry more than one url field (publisher page, DOI link,
% local copy, arXiv). Classic BibTeX keeps only the FIRST url of an entry and
% warns about the extras, so the order of the url fields inside an entry is
% significant and has been preserved.
% NOTE(review): presumably some site tooling reads the repeated url fields;
% confirm before collapsing them into a single url.

@inproceedings{AndrensekEtAl24SiKDD,
  title     = "Connecting company performance to {ESG} terms in financial reports",
  author    = "Andren{\v{s}}ek, Luka and Sitar {\v{S}}u{\v{s}}tar, Katarina and Pollak, Senja and Purver, Matthew",
  booktitle = "Proceedings of the {S}lovenian {KDD} Conference",
  editor    = "Grobelnik, Marko and Mladeni{\'{c}}, Dunja",
  location  = "Ljubljana, Slovenia",
  year      = 2024,
  month     = oct,
  url       = "https://ailab.ijs.si/dunja/SiKDD2024/Papers/IS2024_-_SIKDD_2024_paper_3.pdf",
  url       = "http://www.eecs.qmul.ac.uk/~mpurver/papers/andrensek-et-al24sikdd.pdf",
}

@inproceedings{BarnesEtAl2024ICWSM,
  title     = {Temporal Network Analysis of Email Communication Patterns in a Long Standing Hierarchy},
  author    = {Barnes, Matthew R. and Karan, Mladen and McQuistin, Stephen and Perkins, Colin and Tyson, Gareth and Purver, Matthew and Castro, Ignacio and Clegg, Richard G.},
  booktitle = {Proceedings of the International AAAI Conference on Web and Social Media (ICWSM)},
  volume    = {18},
  year      = {2024},
  month     = may,
  pages     = {126--138},
  doi       = {10.1609/icwsm.v18i1.31302},
  url       = {https://ojs.aaai.org/index.php/ICWSM/article/view/31302},
  url       = {https://doi.org/10.1609/icwsm.v18i1.31302},
  url       = {https://arxiv.org/abs/2311.13442},
}

@inproceedings{caporusso-etal-2024-jadt-politics,
  title     = "Analysing Bias in {S}lovenian News Media: A Computational Comparison Based on Readers' Political Orientation",
  author    = "Caporusso, Jaya and Chatterjee, Nishan and Fivaj{\v{z}}, Zoran and Koloski, Boshko and Ul{\v{c}}ar, Matej and Martinc, Matej and Vezovnik, Andrea and Robnik-{\v{S}}ikonja, Marko and Purver, Matthew and Pollak, Senja",
  editor    = "Dister, Anne and Longr{\'{e}}e, Dominique",
  booktitle = "Proceedings of the 17es Journ{\'{e}}es internationales d'Analyse statistique des Donn{\'{e}}es Textuelles (JADT): Mots compt{\'{e}}s, textes d{\'{e}}chiffr{\'{e}}s",
  month     = jun,
  year      = "2024",
  address   = "Brussels",
  publisher = "Presses Universitaires de Louvain",
  isbn      = "978-2-39061-471-5 / 978-2-39061-472-2",
  annote    = "ISBN 978-2-39061-471-5 / 978-2-39061-472-2",
  url       = "http://www.eecs.qmul.ac.uk/~mpurver/papers/caporusso-et-al24jadt-politics.pdf",
  pages     = "159--168",
  abstract  = "This paper presents a split of a Slovenian news corpus based on the readers' political leaning. By combining Slovenian news data with a large survey giving data on media consumption and self-reported political orientation, we create sub-corpora of news outlets consumed by left-, centre, and right-leaning readers and use it to build a political orientation classifier. Following prior work analysing dehumanisation in text, we then investigate the similarity between the migrants and LGBTQIA+ community social groups with the concept of moral disgust, taking into account the gender variable, across sub-corpora. Our main findings include the fact that female members of the target groups, migrants and LGBTQIA+ community, are more closely associated with moral disgust in the right-wing model.",
}

@inproceedings{caporusso-etal-2024-jadt-self,
  title     = "A Phenomenologically-Inspired Computational Analysis of Self-Categories in Text",
  author    = "Caporusso, Jaya and Koloski, Boshko and Rebernik, Ma{\v{s}}a and Pollak, Senja and Purver, Matthew",
  editor    = "Dister, Anne and Longr{\'{e}}e, Dominique",
  booktitle = "Proceedings of the 17es Journ{\'{e}}es internationales d'Analyse statistique des Donn{\'{e}}es Textuelles (JADT): Mots compt{\'{e}}s, textes d{\'{e}}chiffr{\'{e}}s",
  month     = jun,
  year      = "2024",
  address   = "Brussels",
  publisher = "Presses Universitaires de Louvain",
  isbn      = "978-2-39061-471-5 / 978-2-39061-472-2",
  annote    = "ISBN 978-2-39061-471-5 / 978-2-39061-472-2",
  url       = "http://www.eecs.qmul.ac.uk/~mpurver/papers/caporusso-et-al24jadt-self.pdf",
  pages     = "169--178",
  abstract  = "The self is a pervasive aspect of human experience, influencing crucial areas like mental health and manifesting in the texts we produce. Previous research indicates a significant correlation between the use of self-related expressions --- terms and linguistic structures individuals use to refer to themselves, such as first-person pronouns --- and various personal attributes, including personality traits, mental states, and psychological disorders. These findings enable the construction of simple yet explainable and effective representations, which can be later utilised for downstream tasks like classification, clustering, and segmentation. We present an approach to investigate the self in text data in a more detailed manner, expanding its understanding by adopting aspects of the self as defined by cognitive science and phenomenology. We employ the large language model GPT3.5 to classify text as to whether it presents these self-aspects, and we analyse the obtained splits with LIWC-22. This exploratory study aims to bridge the gap between the knowledge about using self-references in text, Natural Language Processing techniques and applications, and the phenomenological understanding(s) of the self, opening new venues in all three directions.",
}

@inproceedings{caporusso-etal-2024-computational-analysis,
  title     = "A Computational Analysis of the Dehumanisation of Migrants from {S}yria and {U}kraine in {S}lovene News Media",
  author    = "Caporusso, Jaya and Hoogland, Damar and Brglez, Mojca and Koloski, Boshko and Purver, Matthew and Pollak, Senja",
  editor    = "Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen",
  booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
  month     = may,
  year      = "2024",
  address   = "Torino, Italia",
  publisher = "ELRA and ICCL",
  url       = "https://aclanthology.org/2024.lrec-main.18",
  pages     = "199--210",
  abstract  = "Dehumanisation involves the perception and/or treatment of a social group{'}s members as less than human. This phenomenon is rarely addressed with computational linguistic techniques. We adapt a recently proposed approach for English, making it easier to transfer to other languages and to evaluate, introducing a new sentiment resource, the use of zero-shot cross-lingual valence and arousal detection, and a new method for statistical significance testing. We then apply it to study attitudes to migration expressed in Slovene newspapers, to examine changes in the Slovene discourse on migration between the 2015-16 migration crisis following the war in Syria and the 2022-23 period following the war in Ukraine. We find that while this discourse became more negative and more intense over time, it is less dehumanising when specifically addressing Ukrainian migrants compared to others.",
}

@inproceedings{ghinassi-etal-2024-recent,
  title     = "Recent Trends in Linear Text Segmentation: A Survey",
  author    = "Ghinassi, Iacopo and Wang, Lin and Newell, Chris and Purver, Matthew",
  editor    = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung",
  booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
  month     = nov,
  year      = "2024",
  address   = "Miami, Florida, USA",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2024.findings-emnlp.174/",
  doi       = "10.18653/v1/2024.findings-emnlp.174",
  pages     = "3084--3095",
  abstract  = "Linear Text Segmentation is the task of automatically tagging text documents with topic shifts, i.e. the places in the text where the topics change. A well-established area of research in Natural Language Processing, drawing from well-understood concepts in linguistic and computational linguistic research, the field has recently seen a lot of interest as a result of the surge of text, video, and audio available on the web, which in turn require ways of summarising and categorizing the mole of content for which linear text segmentation is a fundamental step. In this survey, we provide an extensive overview of current advances in linear text segmentation, describing the state of the art in terms of resources and approaches for the task. Finally, we highlight the limitations of available resources and of the task itself, while indicating ways forward based on the most recent literature and under-explored research directions.",
}

@inproceedings{ghinassi-etal-2024-cohesion,
  title     = "When Cohesion Lies in the Embedding Space: Embedding-Based Reference-Free Metrics for Topic Segmentation",
  author    = "Ghinassi, Iacopo and Wang, Lin and Newell, Chris and Purver, Matthew",
  editor    = "Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen",
  booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
  month     = may,
  year      = "2024",
  address   = "Torino, Italia",
  publisher = "ELRA and ICCL",
  url       = "https://aclanthology.org/2024.lrec-main.1524/",
  pages     = "17525--17536",
  abstract  = "In this paper we propose a new framework and new methods for the reference-free evaluation of topic segmentation systems directly in the embedding space. Specifically, we define a common framework for reference-free, embedding-based topic segmentation metrics, and show how this applies to an existing metric. We then define new metrics, based on a previously defined cohesion score, Average Relative Proximity. Using this approach, we show that Large Language Models (LLMs) yield features that, if used correctly, can strongly correlate with traditional topic segmentation metrics based on costly and rare human annotations, while outperforming existing reference-free metrics borrowed from clustering evaluation in most domains. We then show that smaller language models specifically fine-tuned for different sentence-level tasks can outperform LLMs several orders of magnitude larger. Via a thorough comparison of our metric's performance across different datasets, we see that conversational data present the biggest challenge in this framework. Finally, we analyse the behaviour of our metrics in specific error cases, such as those of under-generation and moving of ground truth topic boundaries, and show that our metrics behave more consistently than other reference-free methods.",
}

@article{GkoumasEtAl24LRE,
  author    = {Gkoumas, Dimitris and Wang, Bo and Tsakalidis, Adam and Wolters, Maria and Purver, Matthew and Zubiaga, Arkaitz and Liakata, Maria},
  title     = {A Longitudinal Multi-Modal Dataset for Dementia Monitoring and Diagnosis},
  journal   = {Language Resources and Evaluation},
  volume    = 58,
  pages     = "883--902",
  year      = 2024,
  month     = mar,
  publisher = {Springer},
  issn      = "1574-0218",
  doi       = {10.1007/s10579-023-09718-4},
  annote    = "ISSN 1574-0218",
  url       = {https://arxiv.org/abs/2109.01537},
  url       = {https://doi.org/10.1007/s10579-023-09718-4},
}

@proceedings{eacl-2024-european-chapter-association-linguistics,
  title     = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)",
  editor    = "Graham, Yvette and Purver, Matthew",
  month     = mar,
  year      = "2024",
  address   = "St. Julian{'}s, Malta",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2024.eacl-long.0",
}

@proceedings{eacl-2024-european-chapter-association-linguistics-2,
  title     = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)",
  editor    = "Graham, Yvette and Purver, Matthew",
  month     = mar,
  year      = "2024",
  address   = "St. Julian{'}s, Malta",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2024.eacl-short.0",
}

% Healey et al.: a leftover url pointing at a different journal's home page
% (plosone.org) used to come first and so shadowed the DOI link; it has been
% dropped so the DOI link is the url that gets used.
@article{HealeyEtAl24Frontiers,
  author    = {Healey, Patrick G. T. and Khare, Prashant and Tyson, Gareth and Karan, Mladen and Castro, Ignacio and Shekhar, Ravi and McQuistin, Stephen and Perkins, Colin and Purver, Matthew},
  title     = {Power and Vulnerability: Managing Sensitive Language in Organisational Communication},
  journal   = {Frontiers in Psychology},
  volume    = 14,
  year      = 2024,
  pages     = "1266425",
  month     = feb,
  publisher = {Frontiers Media s.a.},
  issn      = "1664-1078",
  doi       = {10.3389/fpsyg.2023.1266425},
  annote    = "ISSN 1664-1078",
  url       = {https://doi.org/10.3389/fpsyg.2023.1266425},
}

@inproceedings{ivacic-etal-2024-comparing,
  title     = "Comparing News Framing of Migration Crises using Zero-Shot Classification",
  author    = "Iva{\v{c}}i{\v{c}}, Nikola and Purver, Matthew and Lind, Fabienne and Pollak, Senja and Boomgaarden, Hajo and Bajt, Veronika",
  editor    = "Sommerauer, Pia and Caselli, Tommaso and Nissim, Malvina and Remijnse, Levi and Vossen, Piek",
  booktitle = "Proceedings of the First Workshop on Reference, Framing, and Perspective @ LREC-COLING 2024",
  month     = may,
  year      = "2024",
  address   = "Torino, Italia",
  publisher = "ELRA and ICCL",
  url       = "https://aclanthology.org/2024.rfp-1.3/",
  pages     = "18--27",
  abstract  = "We present an experiment on classifying news frames in a language unseen by the learner, using zero-shot cross-lingual transfer learning. We used two pre-trained multilingual Transformer Encoder neural network models and tested with four specific news frames, investigating two approaches to the resulting multi-label task: Binary Relevance (treating each frame independently) and Label Power-set (predicting each possible combination of frames). We train our classifiers on an available annotated multilingual migration news dataset and test on an unseen Slovene language migration news corpus, first evaluating performance and then using the classifiers to analyse how media framed the news during the periods of Syria and Ukraine conflict-related migrations.",
}

@inproceedings{li-etal-2024-analyzing,
  title     = "Analyzing and Enhancing Clarification Strategies for Ambiguous References in Consumer Service Interactions",
  author    = "Li, Changling and Gan, Yujian and Yang, Zhenrong and Chen, Youyang and Qiu, Xinxuan and Lin, Yanni and Purver, Matthew and Poesio, Massimo",
  editor    = "Kawahara, Tatsuya and Demberg, Vera and Ultes, Stefan and Inoue, Koji and Mehri, Shikib and Howcroft, David and Komatani, Kazunori",
  booktitle = "Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
  month     = sep,
  year      = "2024",
  address   = "Kyoto, Japan",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2024.sigdial-1.25/",
  doi       = "10.18653/v1/2024.sigdial-1.25",
  pages     = "289--296",
  abstract  = "When customers present ambiguous references, service staff typically need to clarify the customers' specific intentions. To advance research in this area, we collected 1,000 real-world consumer dialogues with ambiguous references. This dataset will be used for subsequent studies to identify ambiguous references and generate responses. Our analysis of the dataset revealed common strategies employed by service staff, including directly asking clarification questions (CQ) and listing possible options before asking a clarification question (LCQ). However, we found that merely using CQ often fails to fully satisfy customers. In contrast, using LCQ, as well as recommending specific products after listing possible options, proved more effective in resolving ambiguous references and enhancing customer satisfaction.",
}

@inproceedings{nakwijit-etal-2024-encoder,
  title     = "How do Encoder-only {LM}s Predict Closeness and Respect from {T}hai Conversations?",
  author    = "Nakwijit, Pakawat and Rutherford, Attapol T. and Purver, Matthew",
  booktitle = "Proceedings of the 28th Workshop on the Semantics and Pragmatics of Dialogue - Full Papers",
  month     = sep,
  year      = "2024",
  address   = "Trento, Italy",
  publisher = "SEMDIAL",
  url       = "http://semdial.org/anthology/Z24-Nakwijit_semdial_0005.pdf",
}

@inproceedings{pelicon-etal-2024-denoising,
  title     = "Denoising Labeled Data for Comment Moderation Using Active Learning",
  author    = "Pelicon, Andra{\v{z}} and Karan, Vanja Mladen and Shekhar, Ravi and Purver, Matthew and Pollak, Senja",
  editor    = "Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen",
  booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
  month     = may,
  year      = "2024",
  address   = "Torino, Italia",
  publisher = "ELRA and ICCL",
  url       = "https://aclanthology.org/2024.lrec-main.413/",
  pages     = "4626--4633",
  abstract  = "Noisily labeled textual data is ample on internet platforms that allow user-created content. Training models, such as offensive language detection models for comment moderation, on such data may prove difficult as the noise in the labels prevents the model to converge. In this work, we propose to use active learning methods for the purposes of denoising training data for model training. The goal is to sample examples the most informative examples with noisy labels with active learning and send them to the oracle for reannotation thus reducing the overall cost of reannotation. In this setting we tested three existing active learning methods, namely DBAL, Variance of Gradients (VoG) and BADGE. The proposed approach to data denoising is tested on the problem of offensive language detection. We observe that active learning can be effectively used for the purposes of data denoising, however care should be taken when choosing the algorithm for this purpose.",
}

@inproceedings{WrightPurver24CAOS,
  title     = "The Geography of Temperature Space",
  author    = "Wright, George and Purver, Matthew",
  booktitle = "Proceedings of CAOS: Cognition And OntologieS at the Joint Ontology Workshops (JOWO)",
  year      = 2024,
  month     = jul,
  location  = "Enschede",
  url       = "https://ceur-ws.org/Vol-3882/caos8-2.pdf",
}