hkcancor

참고자료:

TFDS에 이 데이터세트를 로드하려면 다음 명령어를 사용하세요.

ds = tfds.load('huggingface:hkcancor')

설명 :

The Hong Kong Cantonese Corpus (HKCanCor) comprises transcribed conversations
recorded between March 1997 and August 1998. It contains recordings of
spontaneous speech (51 texts) and radio programmes (42 texts),
which involve 2 to 4 speakers, with 1 text of monologue.

In total, the corpus contains around 230,000 Chinese words.
The text is word-segmented, annotated with part-of-speech (POS) tags and
romanised Cantonese pronunciation.

Romanisation scheme - Linguistic Society of Hong Kong (LSHK)
POS scheme - Peita-Fujitsu-Renmin Ribao (PRF) corpus (Duan et al., 2000),
             with extended tags for Cantonese-specific phenomena added by
             Luke and Wang (see original paper for details).

라이선스 : CC BY 4.0
버전 : 1.0.0
분할 :

분할	예시 수
`'train'`	10801

특징 :

{
    "conversation_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "speaker": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "turn_number": {
        "dtype": "int16",
        "id": null,
        "_type": "Value"
    },
    "tokens": {
        "feature": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "transcriptions": {
        "feature": {
            "dtype": "string",
            "id": null,
            "_type": "Value"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "pos_tags_prf": {
        "feature": {
            "num_classes": 120,
            "names": [
                "!",
                "\"",
                "#",
                "'",
                ",",
                "-",
                ".",
                "...",
                "?",
                "A",
                "AD",
                "AG",
                "AIRWAYS0",
                "AN",
                "AND",
                "B",
                "BG",
                "BEAN0",
                "C",
                "CENTRE0",
                "CG",
                "D",
                "D1",
                "DG",
                "E",
                "ECHO0",
                "F",
                "G",
                "G1",
                "G2",
                "H",
                "HILL0",
                "I",
                "IG",
                "J",
                "JB",
                "JM",
                "JN",
                "JNS",
                "JNT",
                "JNZ",
                "K",
                "KONG",
                "L",
                "L1",
                "LG",
                "M",
                "MG",
                "MONTY0",
                "MOUNTAIN0",
                "N",
                "N1",
                "NG",
                "NR",
                "NS",
                "NSG",
                "NT",
                "NX",
                "NZ",
                "O",
                "P",
                "PEPPER0",
                "Q",
                "QG",
                "R",
                "RG",
                "S",
                "SOUND0",
                "T",
                "TELECOM0",
                "TG",
                "TOUCH0",
                "U",
                "UG",
                "U0",
                "V",
                "V1",
                "VD",
                "VG",
                "VK",
                "VN",
                "VU",
                "VUG",
                "W",
                "X",
                "XA",
                "XB",
                "XC",
                "XD",
                "XE",
                "XJ",
                "XJB",
                "XJN",
                "XJNT",
                "XJNZ",
                "XJV",
                "XJA",
                "XL1",
                "XM",
                "XN",
                "XNG",
                "XNR",
                "XNS",
                "XNT",
                "XNX",
                "XNZ",
                "XO",
                "XP",
                "XQ",
                "XR",
                "XS",
                "XT",
                "XV",
                "XVG",
                "XVN",
                "XX",
                "Y",
                "YG",
                "Y1",
                "Z"
            ],
            "names_file": null,
            "id": null,
            "_type": "ClassLabel"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    },
    "pos_tags_ud": {
        "feature": {
            "num_classes": 16,
            "names": [
                "DET",
                "PRON",
                "VERB",
                "NOUN",
                "ADJ",
                "PUNCT",
                "INTJ",
                "ADV",
                "V",
                "PART",
                "X",
                "NUM",
                "PROPN",
                "AUX",
                "CCONJ",
                "ADP"
            ],
            "names_file": null,
            "id": null,
            "_type": "ClassLabel"
        },
        "length": -1,
        "id": null,
        "_type": "Sequence"
    }
}