{"dcterms:modified":"2024-01-18","dcterms:creator":"heiDATA","@type":"ore:ResourceMap","schema:additionalType":"Dataverse OREMap Format v1.0.0","dvcore:generatedBy":{"@type":"schema:SoftwareApplication","schema:name":"Dataverse","schema:version":"6.1 build 1590-f5d1299","schema:url":"https://github.com/iqss/dataverse"},"@id":"https://heidata.uni-heidelberg.de/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.11588/data/VK99LU","ore:describes":{"citation:topicClassification":{"citation:topicClassValue":"knowledge discovery"},"author":[{"citation:authorName":"Nastase, Vivi","citation:authorAffiliation":"Department of Computational Linguistics, Heidelberg University, Germany"},{"citation:authorName":"Hitschler, Julian","citation:authorAffiliation":"Department of Computational Linguistics, Heidelberg University, Germany"}],"citation:dsDescription":{"citation:dsDescriptionValue":"The data in this collection consists of two parallel directories, one (\"raw\") containing the raw text of 18850 articles from the ACL 2013/02 collection, the other (\"re-segmented\") the word-resegmented version of these articles, obtained using nematus, a seq2seq neural model used for machine translation. The motivation for the work was that spurious spaces in the text seemed to be very common, particularly in older papers, obtained by OCR-ing scanned papers."},"publication":{"publicationCitation":"
Nastase, V. and Hitschler, J. (2018). Correction of OCR word segmentation errors in articles from the ACL collection through neural machine translation methods. In Proceedings of the 11th International Conference on Language Resources and Evaluation, pages 706–711, 7-12 May 2018, Miyazaki, Japan.
","publicationIDType":"url","publicationIDNumber":"https://www.cl.uni-heidelberg.de/english/research/downloads/resource_pages/ACL_corrected/lrec2018_correction-ocr-word.pdf","publicationURL":"https://www.cl.uni-heidelberg.de/english/research/downloads/resource_pages/ACL_corrected/lrec2018_correction-ocr-word.pdf"},"citation:datasetContact":{"citation:datasetContactName":"Nastase, Vivi","citation:datasetContactAffiliation":"Department of Computational Linguistics, Heidelberg University, Germany","citation:datasetContactEmail":"nastase@cl.uni-heidelberg.de"},"citation:keyword":[{"citation:keywordValue":"character-level sequence-to-sequence model"},{"citation:keywordValue":"word segmentation"},{"citation:keywordValue":"ACL collection"}],"language":"English","subject":["Arts and Humanities","Computer and Information Science"],"alternativeURL":"https://www.cl.uni-heidelberg.de/english/research/downloads/resource_pages/ACL_corrected/ACL_corrected.shtml","citation:productionDate":"2018","kindOfData":"textual data","citation:productionPlace":"Heidelberg University","title":"ACL word segmentation correction","@id":"https://doi.org/10.11588/data/VK99LU","@type":["ore:Aggregation","schema:Dataset"],"schema:version":"1.1","schema:name":"ACL word segmentation correction","schema:dateModified":"Thu Sep 12 14:14:23 CEST 2019","schema:datePublished":"2019-07-15","schema:creativeWorkStatus":"RELEASED","dvcore:termsOfUse":"Licensed under a Creative Commons Attribution 4.0 International License. ","dvcore:fileTermsOfAccess":{"dvcore:fileRequestAccess":false},"schema:includedInDataCatalog":"heiDATA","schema:isPartOf":{"schema:name":"Empirical Linguistics and Computational Language Modeling (LiMo)","@id":"https://heidata.uni-heidelberg.de/dataverse/lwc-limo","schema:description":"Data publications of the Leibniz ScienceCampus “Empirical Linguistics and Computational Language Modeling”
\r\nThe Leibniz ScienceCampus “Empirical Linguistics and Computational Language Modeling” (LiMo) is a cooperative research project between the Leibniz Institute for the German Language (Leibniz-Institut für Deutsche Sprache, IDS) in Mannheim and the Department of Computational Linguistics at Heidelberg University (ICL). The general aims of the project are to develop new methods, models, and tools for compiling and analysing automatically large German textual corpora covering different domains, genres and language varieties.
\r\nThe project is supported by funds from the Baden-Württemberg Ministry of Science, Research and the Arts and the Leibniz Association together with funds provided by the Leibniz Institute for the German Language and Heidelberg University.
\r\nFunding Period: 2015 – 2020
\r\n","schema:isPartOf":{"schema:name":"heiDATA","@id":"https://heidata.uni-heidelberg.de/dataverse/root","schema:description":"heiDATA is an institutional repository for Open Research Data from Heidelberg University. It is managed by the Competence Centre for Research Data, a joint institution of the University Library and the Computing Centre. If you are interested in publishing your data here, please have a look at our user guide and get in touch with us. Detailed information on heiDATA can be found in our service description."}},"ore:aggregates":[{"schema:description":"text files","schema:name":"acl-201302_word-resegmented.tar.gz","dvcore:restricted":false,"schema:version":2,"dvcore:datasetVersionId":457,"dvcore:categories":["Data"],"@id":"doi:10.11588/data/VK99LU/NZI6HW","schema:sameAs":"https://heidata.uni-heidelberg.de/api/access/datafile/:persistentId?persistentId=doi:10.11588/data/VK99LU/NZI6HW","@type":"ore:AggregatedResource","schema:fileFormat":"application/gzip","dvcore:filesize":389091782,"dvcore:storageIdentifier":"file://16be02152ef-88252f381f9f","dvcore:rootDataFileId":-1,"dvcore:checksum":{"@type":"MD5","@value":"96d089771cde56bb9ac5296189fb403b"}},{"schema:name":"README","dvcore:restricted":false,"schema:version":2,"dvcore:datasetVersionId":457,"dvcore:categories":["Documentation"],"@id":"doi:10.11588/data/VK99LU/QEQI1P","schema:sameAs":"https://heidata.uni-heidelberg.de/api/access/datafile/:persistentId?persistentId=doi:10.11588/data/VK99LU/QEQI1P","@type":"ore:AggregatedResource","schema:fileFormat":"text/plain; charset=US-ASCII","dvcore:filesize":782,"dvcore:storageIdentifier":"file://16be0201ca9-629afc559656","dvcore:rootDataFileId":-1,"dvcore:checksum":{"@type":"MD5","@value":"b305fd3ce016837f601aa137fd8ecf63"}}],"schema:hasPart":["doi:10.11588/data/VK99LU/NZI6HW","doi:10.11588/data/VK99LU/QEQI1P"]},"@context":{"alternativeURL":"https://schema.org/distribution","author":"http://purl.org/dc/terms/creator","citation":"https://dataverse.org/schema/citation/","dcterms":"http://purl.org/dc/terms/","dvcore":"https://dataverse.org/schema/core#","kindOfData":"http://rdf-vocabulary.ddialliance.org/discovery#kindOfData","language":"http://purl.org/dc/terms/language","ore":"http://www.openarchives.org/ore/terms/","publication":"http://purl.org/dc/terms/isReferencedBy","publicationCitation":"http://purl.org/dc/terms/bibliographicCitation","publicationIDNumber":"http://purl.org/spar/datacite/ResourceIdentifier","publicationIDType":"http://purl.org/spar/datacite/ResourceIdentifierScheme","publicationURL":"https://schema.org/distribution","schema":"http://schema.org/","subject":"http://purl.org/dc/terms/subject","title":"http://purl.org/dc/terms/title"}}