# IdSarcasm: Benchmarking and Evaluating Language Models for Indonesian Sarcasm Detection

This project benchmarks and evaluates language models for sarcasm detection in Indonesian. We experiment with classical machine learning models, fine-tuned transformer models, and zero-shot classification using large language models. All of our models, datasets, and results are openly available on the Hugging Face Hub.

<div align="center">

<a href="https://huggingface.co/collections/w11wo/indonesian-sarcasm-detection-65840069489f3b53a0452c04"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Collections-yellow"></a>

</div>

## Pre-trained Models

| Base Model             | #params | Reddit                                                                                                         | Twitter                                                                                                          |
| ---------------------- | :-----: | -------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
| IndoNLU IndoBERT Base  |  124M   | [IndoNLU IndoBERT Base Reddit](https://huggingface.co/w11wo/indobert-base-p1-reddit-indonesia-sarcastic)        | [IndoNLU IndoBERT Base Twitter](https://huggingface.co/w11wo/indobert-base-p1-twitter-indonesia-sarcastic)        |
| IndoNLU IndoBERT Large |  335M   | [IndoNLU IndoBERT Large Reddit](https://huggingface.co/w11wo/indobert-large-p1-reddit-indonesia-sarcastic)      | [IndoNLU IndoBERT Large Twitter](https://huggingface.co/w11wo/indobert-large-p1-twitter-indonesia-sarcastic)      |
| IndoLEM IndoBERT Base  |  111M   | [IndoLEM IndoBERT Base Reddit](https://huggingface.co/w11wo/indobert-base-uncased-reddit-indonesia-sarcastic)   | [IndoLEM IndoBERT Base Twitter](https://huggingface.co/w11wo/indobert-base-uncased-twitter-indonesia-sarcastic)   |
| mBERT Base             |  178M   | [mBERT Base Reddit](https://huggingface.co/w11wo/bert-base-multilingual-cased-reddit-indonesia-sarcastic)       | [mBERT Base Twitter](https://huggingface.co/w11wo/bert-base-multilingual-cased-twitter-indonesia-sarcastic)       |
| XLM-R Base             |  278M   | [XLM-R Base Reddit](https://huggingface.co/w11wo/xlm-roberta-base-reddit-indonesia-sarcastic)                   | [XLM-R Base Twitter](https://huggingface.co/w11wo/xlm-roberta-base-twitter-indonesia-sarcastic)                   |
| XLM-R Large            |  560M   | [XLM-R Large Reddit](https://huggingface.co/w11wo/xlm-roberta-large-reddit-indonesia-sarcastic)                 | [XLM-R Large Twitter](https://huggingface.co/w11wo/xlm-roberta-large-twitter-indonesia-sarcastic)                 |

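Each checkpoint can be loaded directly from the Hub with the 🤗 `transformers` pipeline API. The snippet below is a minimal sketch using the IndoNLU IndoBERT Base Reddit checkpoint; the label names it returns come from the checkpoint's configuration, so inspect the output rather than hard-coding them.

```python
from transformers import pipeline

# Any of the fine-tuned checkpoints listed above can be swapped in here.
classifier = pipeline(
    "text-classification",
    model="w11wo/indobert-base-p1-reddit-indonesia-sarcastic",
)

# Hypothetical Indonesian comment; the returned label names depend on the
# checkpoint's config and may be generic (e.g. "LABEL_0"/"LABEL_1").
print(classifier("Wah, keren banget ya, macet tiga jam di tol."))
```
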
## Dataset

We used two datasets for training and evaluation: a novel dataset of Indonesian Reddit comments and a Twitter dataset. The Reddit dataset consists of 14,116 comments, while the Twitter dataset consists of 12,861 tweets.

| Dataset                     | Link                                                                              |
| --------------------------- | --------------------------------------------------------------------------------- |
| Reddit Indonesia Sarcastic  | [HuggingFace](https://huggingface.co/datasets/w11wo/reddit_indonesia_sarcastic)    |
| Twitter Indonesia Sarcastic | [HuggingFace](https://huggingface.co/datasets/w11wo/twitter_indonesia_sarcastic)   |

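Both datasets can be pulled with the 🤗 `datasets` library, as in the sketch below. The exact split and column names are not guaranteed here, so check the dataset cards (or the printed `DatasetDict`) before relying on them.

```python
from datasets import load_dataset

# Both datasets are hosted on the Hugging Face Hub.
reddit = load_dataset("w11wo/reddit_indonesia_sarcastic")
twitter = load_dataset("w11wo/twitter_indonesia_sarcastic")

# Inspect the splits and columns first; the "train" split below is an assumption.
print(reddit)
print(reddit["train"][0])
```
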
## Results

We compared the performance of various models on both the Reddit and Twitter datasets. The evaluation metric used is the F1-score.

| Model                    | Reddit F1-score | Twitter F1-score |
| ------------------------ | :-------------: | :--------------: |
| **Classical**            |                 |                  |
| Logistic Regression      |     0.4887      |      0.7142      |
| Naive Bayes              |     0.4591      |      0.6721      |
| SVC                      |     0.4467      |      0.6782      |
| **Fine-tuning**          |                 |                  |
| IndoBERT Base (IndoNLU)  |     0.6100      |      0.7273      |
| IndoBERT Large (IndoNLU) |     0.6184      |      0.7160      |
| IndoBERT Base (IndoLEM)  |     0.5671      |      0.6462      |
| mBERT                    |     0.5338      |      0.6467      |
| XLM-R Base               |     0.5690      |      0.7386      |
| XLM-R Large              |   **0.6274**    |    **0.7692**    |
| **Zero-shot**            |                 |                  |
| BLOOMZ-560M              |     0.3870      |      0.3916      |
| BLOOMZ-1.1B              |     0.3944      |      0.3987      |
| BLOOMZ-1.7B              |     0.3758      |      0.3885      |
| BLOOMZ-3B                |     0.4000      |      0.3847      |
| BLOOMZ-7.1B              |     0.4036      |      0.3968      |
| mT0 Small                |     0.4000      |      0.3988      |
| mT0 Base                 |     0.3990      |      0.3985      |
| mT0 Large                |     0.3998      |      0.3989      |
| mT0 XL                   |     0.4001      |      0.3988      |

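For the zero-shot rows, the instruction-tuned models are prompted directly instead of being fine-tuned. The sketch below illustrates the idea with the publicly available `bigscience/mt0-small` checkpoint; the prompt template is only an illustrative assumption, not necessarily the one used in the paper.

```python
from transformers import pipeline

# mT0 is an encoder-decoder model, so we use the text2text-generation task.
generator = pipeline("text2text-generation", model="bigscience/mt0-small")

text = "Wah, keren banget ya, macet tiga jam di tol."  # hypothetical input
# Illustrative prompt only; map the generated answer back to a binary label.
prompt = (
    "Is the following Indonesian text sarcastic? Answer yes or no.\n"
    f"Text: {text}\n"
    "Answer:"
)
print(generator(prompt, max_new_tokens=3))
```
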
## Citation

If you use this work in your research, please cite:

```bibtex
@article{10565877,
  author   = {Suhartono, Derwin and Wongso, Wilson and Tri Handoyo, Alif},
  journal  = {IEEE Access},
  title    = {IdSarcasm: Benchmarking and Evaluating Language Models for Indonesian Sarcasm Detection},
  year     = {2024},
  volume   = {12},
  pages    = {87323--87332},
  keywords = {Social networking (online); Blogs; Machine learning; Feature extraction; Accuracy; Deep learning; Electronic mail; Natural language processing; Sentiment analysis; Low-resource data; low-resource languages; Indonesian sarcasm detection; natural language processing; sarcasm detection; sentiment analysis},
  doi      = {10.1109/ACCESS.2024.3416955}
}
```

## Author

<a href="https://github.com/w11wo">
  <img src="https://github.com/w11wo.png" alt="GitHub Profile" style="border-radius: 50%; width: 64px; border: solid 1px #fff; margin: 0 4px;">
</a>

## References

```bibtex
@inproceedings{10.1145/3406601.3406624,
  author    = {Khotijah, Siti and Tirtawangsa, Jimmy and Suryani, Arie A.},
  title     = {Using LSTM for Context Based Approach of Sarcasm Detection in Twitter},
  year      = {2020},
  isbn      = {9781450377591},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3406601.3406624},
  doi       = {10.1145/3406601.3406624},
  booktitle = {Proceedings of the 11th International Conference on Advances in Information Technology},
  articleno = {19},
  numpages  = {7},
  keywords  = {context, sarcasm detection, paragraph2vec, lstm, deep learning},
  location  = {Bangkok, Thailand},
  series    = {IAIT '20}
}

@article{Ranti2020IndonesianSD,
  title   = {Indonesian Sarcasm Detection Using Convolutional Neural Network},
  author  = {Kiefer Stefano Ranti and Abba Suganda Girsang},
  journal = {International Journal of Emerging Trends in Engineering Research},
  year    = {2020},
  url     = {https://doi.org/10.30534/ijeter/2020/10892020}
}

@misc{academicReddit,
  title    = {Reddit comments/submissions 2005-06 to 2023-09},
  author   = {stuck_in_the_matrix and Watchful1 and RaiderBDev},
  keywords = {reddit},
  note     = {Reddit comments and submissions from 2005-06 to 2023-09, collected by Pushshift and u/RaiderBDev. These are zstandard-compressed ndjson files. Example Python scripts for parsing the data can be found at https://github.com/Watchful1/PushshiftDumps}
}

@inproceedings{abu-farha-etal-2022-semeval,
  title     = "{S}em{E}val-2022 Task 6: i{S}arcasm{E}val, Intended Sarcasm Detection in {E}nglish and {A}rabic",
  author    = "Abu Farha, Ibrahim and Oprea, Silviu Vlad and Wilson, Steven and Magdy, Walid",
  editor    = "Emerson, Guy and Schluter, Natalie and Stanovsky, Gabriel and Kumar, Ritesh and Palmer, Alexis and Schneider, Nathan and Singh, Siddharth and Ratan, Shyam",
  booktitle = "Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)",
  month     = jul,
  year      = "2022",
  address   = "Seattle, United States",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2022.semeval-1.111",
  doi       = "10.18653/v1/2022.semeval-1.111",
  pages     = "802--814"
}
```