{"id":2317,"identifier":"data/V9CXPR","persistentUrl":"https://doi.org/10.11588/data/V9CXPR","protocol":"doi","authority":"10.11588","publisher":"heiDATA","publicationDate":"2019-02-06","storageIdentifier":"file://10.11588/data/V9CXPR","datasetVersion":{"id":370,"datasetId":2317,"datasetPersistentId":"doi:10.11588/data/V9CXPR","storageIdentifier":"file://10.11588/data/V9CXPR","versionNumber":1,"versionMinorNumber":0,"versionState":"RELEASED","lastUpdateTime":"2019-02-06T14:22:43Z","releaseTime":"2019-02-06T14:22:43Z","createTime":"2019-01-28T14:31:44Z","publicationDate":"2019-02-06","citationDate":"2019-02-06","termsOfUse":"Licensed under a Creative Commons Attribution 4.0 International License. ","fileAccessRequest":false,"metadataBlocks":{"citation":{"displayName":"Citation Metadata","name":"citation","fields":[{"typeName":"title","multiple":false,"typeClass":"primitive","value":"BPEmb: Pre-trained Subword Embeddings in 275 Languages (LREC 2018)"},{"typeName":"alternativeURL","multiple":false,"typeClass":"primitive","value":"https://nlp.h-its.org/bpemb/"},{"typeName":"author","multiple":true,"typeClass":"compound","value":[{"authorName":{"typeName":"authorName","multiple":false,"typeClass":"primitive","value":"Heinzerling, Benjamin"},"authorAffiliation":{"typeName":"authorAffiliation","multiple":false,"typeClass":"primitive","value":"Heidelberg University and Natural Language Processing (NLP) Group at the Heidelberg Institute for Theoretical Studies (HITS)"}}]},{"typeName":"datasetContact","multiple":true,"typeClass":"compound","value":[{"datasetContactName":{"typeName":"datasetContactName","multiple":false,"typeClass":"primitive","value":"Heinzerling, Benjamin"},"datasetContactAffiliation":{"typeName":"datasetContactAffiliation","multiple":false,"typeClass":"primitive","value":"Heidelberg University and Natural Language Processing (NLP) Group at the Heidelberg Institute for Theoretical Studies 
(HITS)"},"datasetContactEmail":{"typeName":"datasetContactEmail","multiple":false,"typeClass":"primitive","value":"benjamin.heinzerling@h-its.org"}}]},{"typeName":"dsDescription","multiple":true,"typeClass":"compound","value":[{"dsDescriptionValue":{"typeName":"dsDescriptionValue","multiple":false,"typeClass":"primitive","value":"BPEmb is a collection of pre-trained subword unit embeddings in 275 languages, based on Byte-Pair Encoding (BPE).\r\nIn an evaluation using fine-grained entity typing as testbed, BPEmb performs competitively, and for some languages better\r\nthan alternative subword approaches, while requiring vastly fewer resources and no tokenization."}}]},{"typeName":"subject","multiple":true,"typeClass":"controlledVocabulary","value":["Computer and Information Science"]},{"typeName":"keyword","multiple":true,"typeClass":"compound","value":[{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"subword embeddings"}},{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"byte-pair encoding"}},{"keywordValue":{"typeName":"keywordValue","multiple":false,"typeClass":"primitive","value":"multilingual"}}]},{"typeName":"publication","multiple":true,"typeClass":"compound","value":[{"publicationCitation":{"typeName":"publicationCitation","multiple":false,"typeClass":"primitive","value":"Heinzerling, B. & Strube, M. (2018). BPEmb: Tokenization-free Pre-trained Subword Embeddings in 275 Languages. In Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018). Miyazaki, Japan. 
European Language Resources Association (ELRA)."},"publicationIDType":{"typeName":"publicationIDType","multiple":false,"typeClass":"controlledVocabulary","value":"url"},"publicationURL":{"typeName":"publicationURL","multiple":false,"typeClass":"primitive","value":"http://www.lrec-conf.org/proceedings/lrec2018/pdf/1049.pdf"}}]},{"typeName":"notesText","multiple":false,"typeClass":"primitive","value":"This dataset is split into 275 archives, one for each language. Languages are identified via their Wikipedia ID, e.g. \"en\" for English or \"de\" for German. Each archive contains three kinds of files:\r\n\r\n