blbooks

مراجع:

همه

برای بارگذاری این مجموعه داده در TFDS از دستور زیر استفاده کنید:

ds = tfds.load('huggingface:blbooks/all')
  • توضیحات :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • مجوز : مجوز شناخته شده ای وجود ندارد
  • نسخه : 1.0.2
  • تقسیمات :
تقسیم کنید نمونه ها
'train' 14011953
  • ویژگی ها :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1800

برای بارگذاری این مجموعه داده در TFDS از دستور زیر استفاده کنید:

ds = tfds.load('huggingface:blbooks/1800s')
  • توضیحات :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • مجوز : مجوز شناخته شده ای وجود ندارد
  • نسخه : 1.0.2
  • تقسیمات :
تقسیم کنید نمونه ها
'train' 13781747
  • ویژگی ها :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1700

برای بارگذاری این مجموعه داده در TFDS از دستور زیر استفاده کنید:

ds = tfds.load('huggingface:blbooks/1700s')
  • توضیحات :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • مجوز : مجوز شناخته شده ای وجود ندارد
  • نسخه : 1.0.2
  • تقسیمات :
تقسیم کنید نمونه ها
'train' 178224
  • ویژگی ها :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1510_1699

برای بارگذاری این مجموعه داده در TFDS از دستور زیر استفاده کنید:

ds = tfds.load('huggingface:blbooks/1510_1699')
  • توضیحات :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • مجوز : مجوز شناخته شده ای وجود ندارد
  • نسخه : 1.0.2
  • تقسیمات :
تقسیم کنید نمونه ها
'train' 51982
  • ویژگی ها :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "timestamp[s]",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1500_1899

برای بارگذاری این مجموعه داده در TFDS از دستور زیر استفاده کنید:

ds = tfds.load('huggingface:blbooks/1500_1899')
  • توضیحات :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • مجوز : مجوز شناخته شده ای وجود ندارد
  • نسخه : 1.0.2
  • تقسیمات :
تقسیم کنید نمونه ها
'train' 14011953
  • ویژگی ها :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "timestamp[s]",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1800_1899

برای بارگذاری این مجموعه داده در TFDS از دستور زیر استفاده کنید:

ds = tfds.load('huggingface:blbooks/1800_1899')
  • توضیحات :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • مجوز : مجوز شناخته شده ای وجود ندارد
  • نسخه : 1.0.2
  • تقسیمات :
تقسیم کنید نمونه ها
'train' 13781747
  • ویژگی ها :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "timestamp[s]",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}

1700_1799

برای بارگذاری این مجموعه داده در TFDS از دستور زیر استفاده کنید:

ds = tfds.load('huggingface:blbooks/1700_1799')
  • توضیحات :
A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.
The books cover a wide range of subject areas including philosophy, history, poetry and literature.
  • مجوز : مجوز شناخته شده ای وجود ندارد
  • نسخه : 1.0.2
  • تقسیمات :
تقسیم کنید نمونه ها
'train' 178224
  • ویژگی ها :
{
    "record_id": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "date": {
        "dtype": "timestamp[s]",
        "id": null,
        "_type": "Value"
    },
    "raw_date": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "title": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "place": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "empty_pg": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    },
    "text": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "pg": {
        "dtype": "int32",
        "id": null,
        "_type": "Value"
    },
    "mean_wc_ocr": {
        "dtype": "float32",
        "id": null,
        "_type": "Value"
    },
    "std_wc_ocr": {
        "dtype": "float64",
        "id": null,
        "_type": "Value"
    },
    "name": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all_names": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Publisher": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Country of publication 1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "all Countries of publication": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Physical description": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_1": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_2": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_3": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "Language_4": {
        "dtype": "string",
        "id": null,
        "_type": "Value"
    },
    "multi_language": {
        "dtype": "bool",
        "id": null,
        "_type": "Value"
    }
}