[ [ [ "core", "DC_expMatrix_DCnMono.tab.gz" ], { "name": "DC_expMatrix_DCnMono.tab", "description": "A data set of 1140 cells from human blood samples. The included cell types are dendritic cells (DCs) with mutants overexpressed for marker genes CD141+ or CD1C+, a double negative mutant CD11C+/CD141-/CD1C-, monocytes and plasmacytoid DCs. Gene expression is measured as raw count data on 26,593 genes.", "collection": "GEO", "version": "1.3", "year": 2017, "instances": 1140, "missing": 0, "variables": 26595, "source": "NCBI", "url": "https://datasets.biolab.si/DC_expMatrix_DCnMono.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "DC_expMatrix_DCnMono.tab.gz", "size": 20325076, "publication_status": 0, "tag": "expression", "tags": [ "expression", "human", "homo-sapiens", "blood" ], "title": "Dendritic cells and monocytes in human blood", "references": [ "Villani, A. C., Satija, ... Jardine, L. (2017). Single-cell RNA-seq reveals new types of human blood dendritic cells, monocytes, and progenitors. Science, 356(6335)." ], "taxid": 9606, "num_of_genes": 26593 } ], [ [ "core", "DC_expMatrix_deeper.characterization.tab.gz" ], { "name": "DC_expMatrix_deeper.characterization.tab", "description": "A data set of 1244 cells from human blood samples. 
The included cell types are dendritic cells (DCs) with mutants overexpressed for marker genes CD141+, CD1C+, pathogenic cells driving blastic plasmacytoid dendritic cell neoplasm (BPDCN) from four donors, a double negative mutant CD11C+/CD141-/CD1C-, monocytes and plasmacytoid DCs, and cells FACS sorted for AXL6+/SIGLEC+ forming a new DC subpopulation.", "collection": "GEO", "version": "1.3", "year": 2017, "instances": 1244, "missing": 0, "variables": 26595, "source": "NCBI", "url": "https://datasets.biolab.si/DC_expMatrix_deeper.characterization.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "DC_expMatrix_deeper.characterization.tab.gz", "size": 18994431, "publication_status": 0, "tag": "expression", "tags": [ "expression", "human", "homo-sapiens", "blood" ], "title": "Dendritic cells and monocytes in human blood (deeper characterization)", "references": [ "Villani, A. C., Satija, ... Jardine, L. (2017). Single-cell RNA-seq reveals new types of human blood dendritic cells, monocytes, and progenitors. Science, 356(6335)." ], "taxid": 9606, "num_of_genes": 26593 } ], [ [ "core", "aml-1k.tab.gz" ], { "name": "aml-1k.tab", "description": "Gene expressions in bone marrow mononuclear cells from a patient with acute myeloid leukemia (AML) and two healthy donors used as controls. The data includes a sample of 1000 cells and 1000 genes with the highest dispersion. 
This is sample data that comes with Loupe Cell Browser, and includes cells from three separate experiments with data sets published on 10x Genomics single-cell data sets page: AML027 Pre-transplant BMMCs, Frozen BMMCs (Healthy Control 1), and Frozen BMMCs (Healthy Control 2).", "collection": "10x Genomics", "version": "1.3", "year": 2017, "instances": 1000, "missing": 0, "variables": 1004, "source": "10x Genomics", "url": "https://datasets.biolab.si/aml-1k.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "aml-1k.tab.gz", "size": 353229, "publication_status": 0, "tag": "aml", "tags": [ "aml", "expression", "sample" ], "title": "Bone marrow mononuclear cells with AML (sample)", "references": [ "Zheng, G. X., Terry, J. M., ... Gregory, M. T. (2017). Massively parallel digital transcriptional profiling of single cells. Nature communications, 8, 14049." ], "taxid": 9606, "num_of_genes": 1000 } ], [ [ "core", "aml-8k.tab.gz" ], { "name": "aml-8k.tab", "description": "Gene expressions in bone marrow mononuclear cells from a patient with acute myeloid leukemia (AML) and two healthy donors used as controls. The data includes over 8000 cells and 1000 genes with the highest dispersion. This is data that comes with Loupe Cell Browser, and includes cells from three separate experiments with data sets published on 10x Genomics single-cell data sets page: AML027 Pre-transplant BMMCs, Frozen BMMCs (Healthy Control 1), and Frozen BMMCs (Healthy Control 2).", "collection": "10x Genomics", "version": "1.3", "year": 2017, "instances": 8390, "missing": 0, "variables": 1004, "source": "10x Genomics", "url": "https://datasets.biolab.si/aml-8k.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "aml-8k.tab.gz", "size": 2859987, "publication_status": 0, "tag": "aml", "tags": [ "aml", "expression" ], "title": "Bone marrow mononuclear cells with AML", "references": [ "Zheng, G. X., Terry, J. M., ... Gregory, M. T. 
(2017). Massively parallel digital transcriptional profiling of single cells. Nature communications, 8, 14049." ], "taxid": 9606, "num_of_genes": 1000 } ], [ [ "core", "baron2016_pancreas_human.pkl.gz" ], { "name": "baron2016_pancreas_human.pkl", "description": "Single-cell RNA sequencing of pancreatic islets from 4 human donors", "collection": "GEO", "version": "3.0", "year": 2016, "instances": 8569, "missing": 0, "variables": 20130, "source": "GEO", "url": "https://datasets.biolab.si/baron2016_pancreas_human.pkl.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "baron2016_pancreas_human.pkl.gz", "size": 22304736, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "pancreas" ], "title": "Pancreas cells in human", "references": [ "Baron, M., Veres, A., Wolock, S. L., Faust, A. L., Gaujoux, R., Vetere, A., ... & Melton, D. A. (2016). A single-cell transcriptomic map of the human and mouse pancreas reveals inter-and intra-cell population structure. Cell systems, 3(4), 346-360." ], "taxid": 9606, "num_of_genes": 8569 } ], [ [ "core", "baron2016_pancreas_human_sample.tab.gz" ], { "name": "baron2016_pancreas_human_sample.tab", "description": "A sample of transcriptomes of major pancreatic cell types from one human donor.", "collection": "GEO", "version": "3.0", "year": 2016, "instances": 1631, "missing": 0, "variables": 5015, "source": "GEO", "url": "https://datasets.biolab.si/baron2016_pancreas_human_sample.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "baron2016_pancreas_human_sample.tab.gz", "size": 1376129, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "pancreas" ], "title": "Pancreas cells in human (sample)", "references": [ "Baron, M., Veres, A., Wolock, S. L., Faust, A. L., Gaujoux, R., Vetere, A., ... & Melton, D. A. (2016). A single-cell transcriptomic map of the human and mouse pancreas reveals inter-and intra-cell population structure. 
Cell systems, 3(4), 346-360." ], "taxid": 9606, "num_of_genes": 5010 } ], [ [ "core", "ccp_data_Tcells_normCounts.counts.all_genes.tab.gz" ], { "name": "ccp_data_Tcells_normCounts.counts.all_genes.tab", "description": "Na\u00efve CD4+ cells from spleens of IL-13eGFP Balb/c mice were negatively selected and differentiated toward TH2 in anti-CD3/CD28 coated plates. Gene expression was normalized with respect to ERCC spike-ins. Cell cycle stage of the cells is not known, but relevant marker genes can be used. The complete dataset contains expression of 38,293 genes.", "collection": "EBI", "version": "1.3", "year": 2014, "instances": 81, "missing": 0, "variables": 38293, "source": "ArrayExpress", "url": "https://datasets.biolab.si/ccp_data_Tcells_normCounts.counts.all_genes.tab.gz", "domain": "sc", "language": "English", "target": "none", "location": "ccp_data_Tcells_normCounts.counts.all_genes.tab.gz", "size": 4650642, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "tcell", "mus-musculus" ], "title": "Cell cycle in T-cells", "references": [ "Mahata, B., Zhang, X., ... Arlt, W. (2014). Single-cell RNA sequencing reveals T helper cells synthesizing steroids de novo to contribute to immune homeostasis. Cell reports, 7(4), 1130-1142.", "Buettner, F., Natarajan, K. N., ... Stegle, O. (2015). Computational analysis of cell-to-cell heterogeneity in single-cell RNA-sequencing data reveals hidden subpopulations of cells. Nature biotechnology, 33(2), 155-160." ], "taxid": 10090, "num_of_genes": 38293 } ], [ [ "core", "ccp_data_Tcells_normCounts.counts.cycle_genes.tab.gz" ], { "name": "ccp_data_Tcells_normCounts.counts.cycle_genes.tab", "description": "Na\u00efve CD4+ cells from spleens of IL-13eGFP Balb/c mice were negatively selected and differentiated toward TH2 in anti-CD3/CD28 coated plates. Gene expression was normalized with respect to ERCC spike-ins. Cell cycle stage of the cells is not known, but relevant marker genes can be used. 
The reduced data set contains expression of 553 genes related to cell cycle based on Gene Ontology (GO) terms.", "collection": "EBI", "version": "1.3", "year": 2014, "instances": 81, "missing": 0, "variables": 553, "source": "ArrayExpress", "url": "https://datasets.biolab.si/ccp_data_Tcells_normCounts.counts.cycle_genes.tab.gz", "domain": "sc", "language": "English", "target": "none", "location": "ccp_data_Tcells_normCounts.counts.cycle_genes.tab.gz", "size": 230943, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "tcell", "mus-musculus" ], "title": "Cell cycle in T-cells (cell cycle genes)", "references": [ "Mahata, B., Zhang, X., ... Arlt, W. (2014). Single-cell RNA sequencing reveals T helper cells synthesizing steroids de novo to contribute to immune homeostasis. Cell reports, 7(4), 1130-1142.", "Buettner, F., Natarajan, K. N., ... Stegle, O. (2015). Computational analysis of cell-to-cell heterogeneity in single-cell RNA-sequencing data reveals hidden subpopulations of cells. Nature biotechnology, 33(2), 155-160." ], "taxid": 10090, "num_of_genes": 553 } ], [ [ "core", "ccp_data_liver.counts.all_genes.tab.gz" ], { "name": "ccp_data_liver.counts.all_genes.tab", "description": "Five liver cells, sequenced using the Smart-seq protocol. Since most liver cells do not proliferate, they are expected to be in G1 cycle phase. The instances in the data set are therefore not labelled (with cell cycle information). 
The complete dataset contains expression of 20,683 genes.", "collection": "GEO", "version": "1.3", "year": 2014, "instances": 5, "missing": 0, "variables": 20683, "source": "NCBI", "url": "https://datasets.biolab.si/ccp_data_liver.counts.all_genes.tab.gz", "domain": "sc", "language": "English", "target": "none", "location": "ccp_data_liver.counts.all_genes.tab.gz", "size": 197362, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "liver", "mus-musculus" ], "title": "Cell cycle in mouse liver", "references": [ "Deng, Q., Ramsk\u00f6ld, D., Reinius, B., Sandberg, R. (2014). Single-cell RNA-seq reveals dynamic, random monoallelic gene expression in mammalian cells. Science, 343(6167), 193-196.", "Scialdone, A., Natarajan, K. N., Saraiva, L. R., ... Buettner, F. (2015). Computational assignment of cell-cycle stage from single-cell transcriptome data. Methods, 85, 54-61." ], "taxid": 10090, "num_of_genes": 20683 } ], [ [ "core", "ccp_data_liver.counts.cycle_genes.tab.gz" ], { "name": "ccp_data_liver.counts.cycle_genes.tab", "description": "Five liver cells, sequenced using the Smart-seq protocol. Since most liver cells do not proliferate, they are expected to be in G1 cycle phase. The instances in the data set are therefore not labelled (with cell cycle information). The reduced data set contains expression of 537 genes related to cell cycle based on Gene Ontology (GO) terms.", "collection": "GEO", "version": "1.3", "year": 2014, "instances": 5, "missing": 0, "variables": 537, "source": "NCBI", "url": "https://datasets.biolab.si/ccp_data_liver.counts.cycle_genes.tab.gz", "domain": "sc", "language": "English", "target": "none", "location": "ccp_data_liver.counts.cycle_genes.tab.gz", "size": 5841, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "liver", "mus-musculus" ], "title": "Cell cycle in mouse liver (cell cycle genes)", "references": [ "Deng, Q., Ramsk\u00f6ld, D., Reinius, B., Sandberg, R. (2014). 
Single-cell RNA-seq reveals dynamic, random monoallelic gene expression in mammalian cells. Science, 343(6167), 193-196.", "Scialdone, A., Natarajan, K. N., Saraiva, L. R., ... Buettner, F. (2015). Computational assignment of cell-cycle stage from single-cell transcriptome data. Methods, 85, 54-61." ], "taxid": 10090, "num_of_genes": 537 } ], [ [ "core", "ccp_data_mESCbulk.counts.all_genes.tab.gz" ], { "name": "ccp_data_mESCbulk.counts.all_genes.tab", "description": "Mouse embryonic stem cells (mESCs) were FACS sorted for cell cycle stages (G1, S and G2M). Approximately 150,000\u2013300,000 cells from an asynchronous population and from each cell cycle fractions (G1, S and G2M) were used for bulk mRNA sequencing, with libraries being generated using the Illumina TruSeq Stranded RNA Sample preparation kit. All libraries were prepared and sequenced using the Wellcome Trust Sanger Institute sample preparation pipeline. Sequencing quality control and data quality checks were performed by the Sanger Sequencing facility. The complete dataset contains expression of 38,293 genes.", "collection": "EBI", "version": "1.3", "year": 2015, "instances": 4, "missing": 0, "variables": 38294, "source": "ArrayExpress", "url": "https://datasets.biolab.si/ccp_data_mESCbulk.counts.all_genes.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "ccp_data_mESCbulk.counts.all_genes.tab.gz", "size": 383715, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "mesc", "mus-musculus", "rna-seq" ], "title": "Cell cycle in mESC (bulk RNA-seq)", "references": [ "Scialdone, A., Natarajan, K. N., Saraiva, L. R., ... Buettner, F. (2015). Computational assignment of cell-cycle stage from single-cell transcriptome data. Methods, 85, 54-61." 
], "taxid": 10090, "num_of_genes": 38293 } ], [ [ "core", "ccp_data_mESCbulk.counts.cycle_genes.tab.gz" ], { "name": "ccp_data_mESCbulk.counts.cycle_genes.tab", "description": "Mouse embryonic stem cells (mESCs) were FACS sorted for cell cycle stages (G1, S and G2M). Approximately 150,000\u2013300,000 cells from an asynchronous population and from each cell cycle fractions (G1, S and G2M) were used for bulk mRNA sequencing, with libraries being generated using the Illumina TruSeq Stranded RNA Sample preparation kit. All libraries were prepared and sequenced using the Wellcome Trust Sanger Institute sample preparation pipeline. Sequencing quality control and data quality checks were performed by the Sanger Sequencing facility. The reduced data set contains expression of 553 genes related to cell cycle based on Gene Ontology (GO) terms.", "collection": "EBI", "version": "1.3", "year": 2015, "instances": 4, "missing": 0, "variables": 554, "source": "ArrayExpress", "url": "https://datasets.biolab.si/ccp_data_mESCbulk.counts.cycle_genes.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "ccp_data_mESCbulk.counts.cycle_genes.tab.gz", "size": 9879, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "mesc", "mus-musculus", "rna-seq" ], "title": "Cell cycle in mESC (bulk RNA-seq, cell cycle genes)", "references": [ "Scialdone, A., Natarajan, K. N., Saraiva, L. R., ... Buettner, F. (2015). Computational assignment of cell-cycle stage from single-cell transcriptome data. Methods, 85, 54-61." ], "taxid": 10090, "num_of_genes": 553 } ], [ [ "core", "ccp_normCountsBuettnerEtAl.counts.all_genes.tab.gz" ], { "name": "ccp_normCountsBuettnerEtAl.counts.all_genes.tab", "description": "A single-cell RNA-seq dataset comprised of 182 mouse embryonic stem cells (mESCs) with known cell-cycle phase. Cells were sorted using FACS for three different cell-cycle phases. 
This resulted in a filtered set of 59 cells in G1 phase, 58 cells in S phase and 65 cells in G2M phase. Next, single-cell RNA-seq was performed using the C1 Single Cell Auto Prep System (Fluidigm). The raw read counts were normalised using two different size factors derived from endogenous genes and ERCC spike-ins. The complete dataset contains expression of 38,293 genes.", "collection": "EBI", "version": "1.3", "year": 2015, "instances": 182, "missing": 0, "variables": 38294, "source": "ArrayExpress", "url": "https://datasets.biolab.si/ccp_normCountsBuettnerEtAl.counts.all_genes.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "ccp_normCountsBuettnerEtAl.counts.all_genes.tab.gz", "size": 4110383, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "mesc", "mus-musculus" ], "title": "Cell cycle in mESC (Fluidigm)", "references": [ "Buettner, F., Natarajan, K. N., ... Stegle, O. (2015). Computational analysis of cell-to-cell heterogeneity in single-cell RNA-sequencing data reveals hidden subpopulations of cells. Nature biotechnology, 33(2), 155-160.", "Scialdone, A., Natarajan, K. N., Saraiva, L. R., ... Buettner, F. (2015). Computational assignment of cell-cycle stage from single-cell transcriptome data. Methods, 85, 54-61." ], "taxid": 10090, "num_of_genes": 38293 } ], [ [ "core", "ccp_normCountsBuettnerEtAl.counts.cycle_genes.tab.gz" ], { "name": "ccp_normCountsBuettnerEtAl.counts.cycle_genes.tab", "description": "A single-cell RNA-seq dataset comprised of 182 mouse embryonic stem cells (mESCs) with known cell-cycle phase. Cells were sorted using FACS for three different cell-cycle phases. This resulted in a filtered set of 59 cells in G1 phase, 58 cells in S phase and 65 cells in G2M phase. Next, single-cell RNA-seq was performed using the C1 Single Cell Auto Prep System (Fluidigm). The raw read counts were normalised using two different size factors derived from endogenous genes and ERCC spike-ins. 
The reduced data set contains expression of 563 genes related to cell cycle based on Gene Ontology (GO) terms.", "collection": "EBI", "version": "1.3", "year": 2015, "instances": 182, "missing": 0, "variables": 564, "source": "ArrayExpress", "url": "https://datasets.biolab.si/ccp_normCountsBuettnerEtAl.counts.cycle_genes.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "ccp_normCountsBuettnerEtAl.counts.cycle_genes.tab.gz", "size": 150014, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "mesc", "mus-musculus" ], "title": "Cell cycle in mESC (Fluidigm, cell cycle genes)", "references": [ "Buettner, F., Natarajan, K. N., ... Stegle, O. (2015). Computational analysis of cell-to-cell heterogeneity in single-cell RNA-sequencing data reveals hidden subpopulations of cells. Nature biotechnology, 33(2), 155-160.", "Scialdone, A., Natarajan, K. N., Saraiva, L. R., ... Buettner, F. (2015). Computational assignment of cell-cycle stage from single-cell transcriptome data. Methods, 85, 54-61." ], "taxid": 10090, "num_of_genes": 563 } ], [ [ "core", "ccp_normCounts_mESCquartz.counts.all_genes.tab.gz" ], { "name": "ccp_normCounts_mESCquartz.counts.all_genes.tab", "description": "The mouse embryonic stem cells (mESCs) were FACS sorted into G1, S and G2M phases. A total of 35 cells (seven S, eight G2M and 20 G1 cells) were sequenced using the Quartz-seq protocol and gene expression was normalised to FPKM values. The amount of technical noise expected for genes with variable levels of expression was estimated using a log-linear fit between the expression mean and the squared coefficient of variation between cells. 
The complete dataset contains expression of 36,807 genes.", "collection": "GEO", "version": "1.3", "year": 2013, "instances": 35, "missing": 0, "variables": 36808, "source": "NCBI", "url": "https://datasets.biolab.si/ccp_normCounts_mESCquartz.counts.all_genes.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "ccp_normCounts_mESCquartz.counts.all_genes.tab.gz", "size": 774815, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "mesc", "mus-musculus" ], "title": "Cell cycle in mESC (QuartzSeq)", "references": [ "Sasagawa, Y., Nikaido, I., ..., Ueda, H. R. (2013). Quartz-Seq: a highly reproducible and sensitive single-cell RNA sequencing method, reveals non-genetic gene-expression heterogeneity. Genome biology, 14(4), 3097.", "Scialdone, A., Natarajan, K. N., Saraiva, L. R., ... Buettner, F. (2015). Computational assignment of cell-cycle stage from single-cell transcriptome data. Methods, 85, 54-61." ], "taxid": 10090, "num_of_genes": 36807 } ], [ [ "core", "ccp_normCounts_mESCquartz.counts.cycle_genes.tab.gz" ], { "name": "ccp_normCounts_mESCquartz.counts.cycle_genes.tab", "description": "The mESCs were FACS sorted into G1, S and G2M phases. A total of 35 cells (seven S, eight G2M and 20 G1 cells) were sequenced using the Quartz-seq protocol and gene expression was normalised to FPKM values. The amount of technical noise expected for genes with variable levels of expression was estimated using a log-linear fit between the expression mean and the squared coefficient of variation between cells. 
The reduced data set contains expression of 561 genes related to cell cycle based on Gene Ontology (GO) terms.", "collection": "GEO", "version": "1.3", "year": 2013, "instances": 35, "missing": 0, "variables": 562, "source": "NCBI", "url": "https://datasets.biolab.si/ccp_normCounts_mESCquartz.counts.cycle_genes.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "ccp_normCounts_mESCquartz.counts.cycle_genes.tab.gz", "size": 24516, "publication_status": 0, "tag": "mouse", "tags": [ "mouse", "expression", "mesc", "mus-musculus" ], "title": "Cell cycle in mESC (QuartzSeq, cell cycle genes)", "references": [ "Sasagawa, Y., Nikaido, I., ..., Ueda, H. R. (2013). Quartz-Seq: a highly reproducible and sensitive single-cell RNA sequencing method, reveals non-genetic gene-expression heterogeneity. Genome biology, 14(4), 3097.", "Scialdone, A., Natarajan, K. N., Saraiva, L. R., ... Buettner, F. (2015). Computational assignment of cell-cycle stage from single-cell transcriptome data. Methods, 85, 54-61." ], "taxid": 10090, "num_of_genes": 561 } ], [ [ "core", "cdp_expression_macosko.tab.gz" ], { "name": "cdp_expression_macosko.tab", "description": "DropSeq analysis of more than 6,000 mouse retinal cells with expression levels of more than 6,800 genes expressed in at least 5% of the cells. The cells are labelled with corresponding bipolar cell (BC) cluster identified by the original study.", "collection": "GEO", "version": "2.3", "year": 2015, "instances": 6243, "missing": 1, "variables": 6862, "source": "NCBI", "url": "https://datasets.biolab.si/cdp_expression_macosko.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "cdp_expression_macosko.tab.gz", "size": 7786356, "publication_status": 0, "tag": "expression", "tags": [ "expression", "mouse", "mus-musculus", "neuron", "drop-seq" ], "title": "Mouse retinal bipolar neurons (DropSeq)", "references": [ "Macosko, E. Z., Basu, A., Satija, R., ... Trombetta, J. J. 
(2015). Highly parallel genome-wide expression profiling of individual cells using nanoliter droplets. Cell, 161(5), 1202-1214." ], "taxid": 10090, "num_of_genes": 6860 } ], [ [ "core", "cdp_expression_shekhar.tab.gz" ], { "name": "cdp_expression_shekhar.tab", "description": "The dataset contains a heterogeneous class of neurons, mouse retinal bipolar cells (BCs). Gene expression was measured with the DropSeq protocol. More than 4,900 genes expressed in at least 5% of the cells are included. The 12,606 cells are classified into 13 subtypes based on morphology and position.", "collection": "GEO", "version": "1.3", "year": 2016, "instances": 12606, "missing": 0, "variables": 4982, "source": "NCBI", "url": "https://datasets.biolab.si/cdp_expression_shekhar.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "cdp_expression_shekhar.tab.gz", "size": 11191059, "publication_status": 0, "tag": "expression", "tags": [ "expression", "mouse", "mus-musculus", "neuron", "drop-seq" ], "title": "Mouse retinal bipolar neurons (DropSeq, large)", "references": [ "Shekhar, K., Lapan, S. W., ... McCarroll, S. A. (2016). Comprehensive classification of retinal bipolar neurons by single-cell transcriptomics. Cell, 166(5), 1308-1323." ], "taxid": 10090, "num_of_genes": 4980 } ], [ [ "core", "dm_proj_neurons_li2017.pkl.gz" ], { "name": "dm_proj_neurons_li2017.pkl", "description": "The data set contains 1842 projection neurons from Drosophila Melanogaster. 
The brains with mCD8GFP-labeled cells using specific GAL4 drivers were manually dissected, and two optical lobes were removed.", "collection": "EBI", "version": "1.0", "year": 2017, "instances": 1842, "missing": 0, "variables": 13920, "source": "EBI", "url": "https://datasets.biolab.si/dm_proj_neurons_li2017.pkl.gz", "domain": "sc", "language": "English", "target": "none", "location": "dm_proj_neurons_li2017.pkl.gz", "size": 24587348, "publication_status": 0, "tag": "drosophila-melanogaster", "tags": [ "drosophila-melanogaster", "expression", "differentiation" ], "title": "Drosophila Olfactory Projection Neuron Subtypes", "references": [ "Li, Hongjie, et al. Classifying Drosophila olfactory projection neuron subtypes by single-cell RNA sequencing. Cell 171.5 (2017): 1206-1220." ], "taxid": 7227, "num_of_genes": 13898 } ], [ [ "core", "galen2019_AML_bone_marrow_day0.pkl.gz" ], { "name": "galen2019_AML_bone_marrow_day0.pkl", "description": "Bone marrow aspirate from AML patient before chemotherapy", "collection": "GEO", "version": "1.0", "year": 2018, "instances": 2328, "missing": 0, "variables": 27701, "source": "GEO", "url": "https://datasets.biolab.si/galen2019_AML_bone_marrow_day0.pkl.gz", "domain": "sc", "language": "English", "target": "none", "location": "galen2019_AML_bone_marrow_day0.pkl.gz", "size": 6275273, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "AML", "bone marrow" ], "title": "AML patient bone marrow day 0", "references": [ "van Galen, P., Hovestadt, V., Wadsworth II, M. H., Hughes, T. K., Griffin, G. K., Battaglia, S., ... & Pinkus, G. S. (2019). Single-Cell RNA-Seq Reveals AML Hierarchies Relevant to Disease Progression and Immunity. Cell, 176(6), 1265-1281." 
], "taxid": 9606, "num_of_genes": 27899 } ], [ [ "core", "galen2019_AML_bone_marrow_day15.pkl.gz" ], { "name": "galen2019_AML_bone_marrow_day15.pkl", "description": "Bone marrow aspirate from AML patient 15 days after first undergoing chemotherapy...", "collection": "GEO", "version": "1.2", "year": 2018, "instances": 1203, "missing": 0, "variables": 27701, "source": "GEO", "url": "https://datasets.biolab.si/galen2019_AML_bone_marrow_day15.pkl.gz", "domain": "sc", "language": "English", "target": "none", "location": "galen2019_AML_bone_marrow_day15.pkl.gz", "size": 3863633, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "AML", "bone marrow" ], "title": "AML patient bone marrow day 15", "references": [ "van Galen, P., Hovestadt, V., Wadsworth II, M. H., Hughes, T. K., Griffin, G. K., Battaglia, S., ... & Pinkus, G. S. (2019). Single-Cell RNA-Seq Reveals AML Hierarchies Relevant to Disease Progression and Immunity. Cell, 176(6), 1265-1281." ], "taxid": 9606, "num_of_genes": 27899 } ], [ [ "core", "galen2019_AML_bone_marrow_day31.pkl.gz" ], { "name": "galen2019_AML_bone_marrow_day31.pkl", "description": "Bone marrow aspirate from AML patient 31 days after first undergoing chemotherapy", "collection": "GEO", "version": "1.2", "year": 2018, "instances": 1203, "missing": 0, "variables": 27701, "source": "GEO", "url": "https://datasets.biolab.si/galen2019_AML_bone_marrow_day31.pkl.gz", "domain": "sc", "language": "English", "target": "none", "location": "galen2019_AML_bone_marrow_day31.pkl.gz", "size": 3863633, "publication_status": 0, "tag": "AML", "tags": [ "AML", "bone marrow", "expression", "human" ], "title": "AML patient bone marrow day 31", "references": [ "van Galen, P., Hovestadt, V., Wadsworth II, M. H., Hughes, T. K., Griffin, G. K., Battaglia, S., ... & Pinkus, G. S. (2019). Single-Cell RNA-Seq Reveals AML Hierarchies Relevant to Disease Progression and Immunity. Cell, 176(6), 1265-1281." 
], "taxid": 9606, "num_of_genes": 27899 } ], [ [ "core", "galen2019_healthy_bone_marrow.pkl.gz" ], { "name": "galen2019_healthy_bone_marrow.pkl", "description": "Single cell profile of a bone marrow aspirate from a healthy donor containing 3739 cells", "collection": "GEO", "version": "1.0", "year": 2018, "instances": 3737, "missing": 0, "variables": 27701, "source": "GEO", "url": "https://datasets.biolab.si/galen2019_healthy_bone_marrow.pkl.gz", "domain": "sc", "language": "English", "target": "none", "location": "galen2019_healthy_bone_marrow.pkl.gz", "size": 10082836, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "bone marrow" ], "title": "Healthy human bone marrow", "references": [ "van Galen, P., Hovestadt, V., Wadsworth II, M. H., Hughes, T. K., Griffin, G. K., Battaglia, S., ... & Pinkus, G. S. (2019). Single-Cell RNA-Seq Reveals AML Hierarchies Relevant to Disease Progression and Immunity. Cell, 176(6), 1265-1281." ], "taxid": 9606, "num_of_genes": 27699 } ], [ [ "core", "miller2019_chronically_infected_CD8.pkl.gz" ], { "name": "miller2019_chronically_infected_CD8.pkl", "description": "Expression profile obtained by high throughput sequencing of distinct populations of progenitor exhausted and terminally exhausted CD8+ T-cells that occur in chronic LCMV Clone 13 infection in mouse", "collection": "GEO", "version": "1.0", "year": 2018, "instances": 9197, "missing": 0, "variables": 22193, "source": "GEO", "url": "https://datasets.biolab.si/nestorawa_forcellcycle.pkl.gz", "domain": "sc", "language": "English", "target": "none", "location": "nestorawa_forcellcycle.pkl.gz", "size": 17564953, "publication_status": 0, "tag": "mus-musculus", "tags": [ "mus-musculus", "expression", "HPSC", "cell-cycle", "differentiation" ], "title": "Mouse haematopoietic stem and progenitor cell differentiation", "references": [ "Nestorowa, S., Hamey, F. K., Sala, B. P., Diamanti, E., Shepherd, M., Laurenti, E., ... & G\u00f6ttgens, B. (2016). 
A single cell resolution map of mouse haematopoietic stem and progenitor cell differentiation. Blood, blood-2016." ], "taxid": 10090, "num_of_genes": 23929 } ], [ [ "core", "pbmc_kang2018_raw_control.pkl.gz" ], { "name": "pbmc_kang2018_raw_control.pkl", "description": "Multiplexed dscRNA-seq was used to characterize the cell-type specificity and inter-individual variability of response to IFN-\u03b2, a potent cytokine that induces genome-scale changes in the transcriptional profiles of immune cells. From each of eight lupus patients, PBMCs were activated with recombinant IFN-\u03b2 or left untreated for 6 h, a time point previously found to maximize the expression of interferon-sensitive genes in dendritic cells and T cells. Two pools, IFN-\u03b2-treated and control, were prepared with the same number of cells from each individual and loaded onto the 10\u00d7 Chromium instrument.", "collection": "GEO", "version": "2.0", "year": 2018, "instances": 13019, "missing": 0, "variables": 35637, "source": "GEO", "url": "https://datasets.biolab.si/pbmc_kang2018_raw_control.pkl.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "pbmc_kang2018_raw_control.pkl.gz", "size": 19504172, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "pbmc", "immune-system" ], "title": "Stimulated and resting immune cells (control)", "references": [ "Kang, H. M., Subramaniam, M., Targ, S., Nguyen, M., Maliskova, L., McCarthy, E., ... & Gate, R. E. (2018). Multiplexed droplet single-cell RNA-sequencing using natural genetic variation. Nature biotechnology, 36(1), 89.", "Butler, A., Hoffman, P., Smibert, P., Papalexi, E., & Satija, R. (2018). Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nature biotechnology, 36(5), 411." 
], "taxid": 9606, "num_of_genes": 35635 } ], [ [ "core", "pbmc_kang2018_raw_stimulated.pkl.gz" ], { "name": "pbmc_kang2018_raw_stimulated.pkl", "description": "Multiplexed dscRNA-seq was used to characterize the cell-type specificity and inter-individual variability of response to IFN-\u03b2, a potent cytokine that induces genome-scale changes in the transcriptional profiles of immune cells. From each of eight lupus patients, PBMCs were activated with recombinant IFN-\u03b2 or left untreated for 6 h, a time point previously found to maximize the expression of interferon-sensitive genes in dendritic cells and T cells16,17. Two pools, IFN-\u03b2-treated and control, were prepared with the same number of cells from each individual and loaded onto the 10\u00d7 Chromium instrument.", "collection": "GEO", "version": "2.0", "year": 2018, "instances": 12875, "missing": 0, "variables": 35637, "source": "GEO", "url": "https://datasets.biolab.si/pbmc_kang2018_raw_stimulated.pkl.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "pbmc_kang2018_raw_stimulated.pkl.gz", "size": 19932029, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "pbmc", "immune-system" ], "title": "Stimulated and resting immune cells (stimulated)", "references": [ "Kang, H. M., Subramaniam, M., Targ, S., Nguyen, M., Maliskova, L., McCarthy, E., ... & Gate, R. E. (2018). Multiplexed droplet single-cell RNA-sequencing using natural genetic variation. Nature biotechnology, 36(1), 89.", "Butler, A., Hoffman, P., Smibert, P., Papalexi, E., & Satija, R. (2018). Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nature biotechnology, 36(5), 411." ], "taxid": 9606, "num_of_genes": 35635 } ], [ [ "core", "pbmc_kang2018_sample.tab.gz" ], { "name": "pbmc_kang2018_sample.tab", "description": "A preprocessed sample of Kang et al. 
(2018) data containing 1,000 controls and stimulated cells and 1,500 highly variable genes. Expression was CPM-normalized, log-transformed, and z-standardized. In the original study, the multiplexed dscRNA-seq was used to characterize the cell-type specificity and inter-individual variability of response to IFN-\u03b2, a potent cytokine that induces genome-scale changes in the transcriptional profiles of immune cells. From each of eight lupus patients, PBMCs were activated with recombinant IFN-\u03b2 or left untreated for 6 h, a time point previously found to maximize the expression of interferon-sensitive genes in dendritic cells and T cells16,17. Two pools, IFN-\u03b2-treated and control, were prepared with the same number of cells from each individual and loaded onto the 10\u00d7 Chromium instrument.", "collection": "GEO", "version": "1.0", "year": 2018, "instances": 1000, "missing": 0, "variables": 1502, "source": "GEO", "url": "https://datasets.biolab.si/pbmc_kang2018_sample.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "pbmc_kang2018_sample.tab.gz", "size": 4000592, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "pbmc", "immune-system" ], "title": "Stimulated and resting immune cells (1000 cells)", "references": [ "Kang, H. M., Subramaniam, M., Targ, S., Nguyen, M., Maliskova, L., McCarthy, E., ... & Gate, R. E. (2018). Multiplexed droplet single-cell RNA-sequencing using natural genetic variation. Nature biotechnology, 36(1), 89.", "Butler, A., Hoffman, P., Smibert, P., Papalexi, E., & Satija, R. (2018). Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nature biotechnology, 36(5), 411." 
], "taxid": 9606, "num_of_genes": 1500 } ], [ [ "core", "xin2016_pancreas_human.tab.gz" ], { "name": "xin2016_pancreas_human.tab", "description": "Data gathered using single-cell RNA sequencing to determine the transcriptomes of 1,492 human pancreatic \u03b1-, \u03b2-, \u03b4- and PP cells from non-diabetic and type 2 diabetes organ donors. 245 genes with disturbed expression in type 2 diabetes can be identified from it.", "collection": "GEO", "version": "1.0", "year": 2016, "instances": 1492, "missing": 0, "variables": 35900, "source": "GEO", "url": "https://datasets.biolab.si/xin2016_pancreas_human.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "xin2016_pancreas_human.tab.gz", "size": 74519264, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "pancreas", "diabetes" ], "title": "Pancreas cells in human (type 2 diabetes)", "references": [ "Xin, Y, Kim, J., Okamoto, H., Ni, M., Wei, Y., Adler, C., J. Murphy, A., D. Yancopoulos, Lin, C., Gromada, J. (2016). RNA Sequencing of Single Human Islet Cells Reveals Type 2 Diabetes Genes. Cell Metabolism, 24 (4), 608-615." ], "taxid": 9606, "num_of_genes": 35899 } ], [ [ "core", "xin2016_pancreas_human_sample.tab.gz" ], { "name": "xin2016_pancreas_human_sample.tab", "description": "A sample of 500 single cells gathered using single-cell RNA sequencing to determine the transcriptomes of human pancreatic \u03b1-, \u03b2-, \u03b4- and PP cells from non-diabetic and type 2 diabetes organ donors. 
245 genes with disturbed expression in type 2 diabetes can be identified from it.", "collection": "GEO", "version": "3.0", "year": 2016, "instances": 500, "missing": 0, "variables": 4648, "source": "GEO", "url": "https://datasets.biolab.si/xin2016_pancreas_human_sample.tab.gz", "domain": "sc", "language": "English", "target": "categorical", "location": "xin2016_pancreas_human_sample.tab.gz", "size": 6046346, "publication_status": 0, "tag": "human", "tags": [ "human", "expression", "pancreas", "diabetes" ], "title": "Pancreas cells in human (type 2 diabetes) (sample)", "references": [ "Xin, Y, Kim, J., Okamoto, H., Ni, M., Wei, Y., Adler, C., J. Murphy, A., D. Yancopoulos, Lin, C., Gromada, J. (2016). RNA Sequencing of Single Human Islet Cells Reveals Type 2 Diabetes Genes. Cell Metabolism, 24 (4), 608-615." ], "taxid": 9606, "num_of_genes": 4647 } ], [ [ "core", "BBC3.tab" ], { "name": "BBC3", "description": "A subset of BBC news articles, containing categories business, entertainment, and sport from 2004-2005. These datasets are made available for non-commercial and research purposes only. All rights, including copyright, in the content of the original articles are owned by the BBC.", "collection": "", "version": "2.1", "year": 2006, "instances": 1407, "missing": 0, "variables": 3, "source": "ML Resources", "url": "https://datasets.biolab.si/BBC3.tab", "domain": null, "language": "English", "target": "categorical", "location": "BBC3.tab", "size": 2754486, "publication_status": 0, "tag": "text", "tags": [ "text", "classification", "news" ], "title": "BBC3", "references": [ "D. Greene and P. Cunningham. [Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering](http://mlg.ucd.ie/files/publications/greene06icml.pdf), Proc. ICML 2006." 
] } ], [ [ "core", "GDS360.tab" ], { "name": "GDS360", "description": "Breast cancer core biopsies taken from patients found to be resistant (greater than 25% residual tumor volume) or sensitive (less than 25% residual tumor volume) to docetaxel treatment.", "collection": "NCBI", "version": "1.0", "year": 2006, "instances": 24, "missing": 0, "variables": 9486, "source": "NCBI", "url": "https://datasets.biolab.si/GDS360.tab", "domain": null, "language": "English", "target": "categorical", "location": "GDS360.tab", "size": 1870893, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Breast Cancer and Docetaxel Treatment", "references": [ "Chang JC, Wooten EC, Tsimelzon A, Hilsenbeck SG et al. (2005) Patterns of resistance and incomplete response to docetaxel by gene expression profiling in breast cancer patients. J Clin Oncol, 23(6): 1169-77." ] } ], [ [ "core", "GDS3713-small.tab" ], { "name": "GDS3713-small", "description": "Gene expression data from peripheral circulating B cells from smoking (39 samples) and non-smoking healthy US white females (40 samples). Only 3,000 randomly selected genes (features) were retained from original data set to reduce the data size.", "collection": "GEO", "version": "1.0", "year": 2009, "instances": 79, "missing": 0, "variables": 3001, "source": "Gene Expression Omnibus GDS3713", "url": "https://datasets.biolab.si/GDS3713-small.tab", "domain": null, "language": "English", "target": "categorical", "location": "GDS3713-small.tab", "size": 1887679, "publication_status": 0, "tag": "genomics", "tags": [ "genomics" ], "title": "Smoking effect on B lymphocytes" } ], [ [ "core", "HDI.tab" ], { "name": "HDI", "description": "The Human Development Index (HDI) is a summary measure of average achievement in key dimensions of human development: a long and healthy life, knowledge, and decent living. The data includes HDI as reported in 2015 and selected related socioeconomic indicators from countries worldwide. 
We have removed HDI-related variables and countries with a missing HDI score.", "collection": "UNDP Human Development Reports", "version": "3.1", "year": 2015, "instances": 188, "missing": 1, "variables": 54, "source": "Human Development Data (1990-2015)", "url": "https://datasets.biolab.si/HDI.tab", "domain": null, "language": "English", "target": "none", "location": "HDI.tab", "size": 46299, "publication_status": 0, "tag": "economy", "tags": [ "economy", "geo" ], "title": "HDI" } ], [ [ "core", "ParlaMint-1000.tab" ], { "name": "ParlaMint-1000", "description": "A sample of 1000 parliamentary debates from the ParlaMint-GB 2.1 corpus. ParlaMint-GB features speeches from both houses of the parliament for years 2019 and 2020. It comes with rich metadata, such as speaker name, party affiliation, and speaker role. A column with GPT-generated summaries is added.", "collection": "", "version": "2.2", "year": 2021, "instances": 1000, "missing": 0, "variables": 18, "source": "https://github.com/clarin-eric/ParlaMint/blob/main/Samples/ParlaMint-GB/ParlaMint-GB.ana.xml", "url": "https://datasets.biolab.si/ParlaMint-1000.tab", "domain": null, "language": "English", "target": "categorical", "location": "ParlaMint-1000.tab", "size": 1795759, "publication_status": 0, "tag": "text", "tags": [ "text", "classification", "time", "politics" ], "title": "ParlaMint", "references": [ "Toma\u017e Erjavec et al. The ParlaMint corpora of parliamentary proceedings. Language Resources and Evaluation, 2022. https://doi.org/10.1007/s10579-021-09574-0" ] } ], [ [ "core", "SentiNews-SI.tab" ], { "name": "SentiNews-SI", "description": "A subset of 2000 documents from AutoSentiNews data set. The original corpus contains 256,567 documents from the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24, which contain political, business, economic and financial content. The subset covers only Dnevnik and 24ur. 
The news articles are annotated as positive, negative or neutral at the document level.", "collection": "CLARIN", "version": "1.0", "year": 2017, "instances": 2000, "missing": 0, "variables": 8, "source": "CLARIN Repository", "url": "https://datasets.biolab.si/SentiNews-SI.tab", "domain": null, "language": "English", "target": "categorical", "location": "SentiNews-SI.tab", "size": 5203086, "publication_status": 0, "tag": "text", "tags": [ "text", "sentiment" ], "title": "SentiNews", "references": [ "Bu\u010dar, Jo\u017ee, 2017, Automatically sentiment annotated Slovenian news corpus AutoSentiNews 1.0, Slovenian language resource repository CLARIN.SI, http://hdl.handle.net/11356/1109." ] } ], [ [ "core", "TKI-resistance-spectroscopy.tab" ], { "name": "TKI-resistance-spectroscopy", "description": "An infrared spectroscopy dataset on Tyrosine Kinase Inhibitor resistance. Chronic Myeloid Leukemia (CML) is a common blood disease caused by a chromosomal translocation creating a chimeric tyrosine kinase (BCR-ABL) that is permanently active and causes uncontrolled proliferation of blood progenitor cells. CML crises can be very effectively treated by using drugs called Tyrosine Kinase Inhibitors (TKI), but TKI resistance appears in some patients that prompted the development of new generation TKIs. However, the T315I mutation renders BCR-ABL kinases resistant to all generations of TKI (resistance to imatinib, dasatinib or nilotinib). The dataset is from the Sandt et al. (2018) study where murine embryonic stem cells were used to evaluate the T315I mutation in a stem cell context, mimicking the blood progenitor context. The question was whether we could separate the cells expressing the mutated BCR-ABL from the cells expressing the wild type BCR-ABL. Murine embryonic GS2 cells (mES GS2) were transduced either with MIGR, MIGR-BCR-ABL or MIGR-T315I vectors and grown on low-e slides. 
The dataset contains spectra of individual mES GS2 cells measured by synchrotron radiation FTIR microspectroscopy in transflection mode at 12x12 \u00b5m\u00b2 spatial resolution. Data was collected and compiled by Dr. Christophe Sandt.", "collection": "spectral", "version": "1.0", "year": 2018, "instances": 280, "missing": 0, "variables": 468, "source": "", "url": "https://datasets.biolab.si/TKI-resistance-spectroscopy.tab", "domain": null, "language": "English", "target": "categorical", "location": "TKI-resistance-spectroscopy.tab", "size": 1310515, "publication_status": 0, "tag": "spectral", "tags": [ "spectral" ], "title": "TKI resistance", "references": [ "Sandt C, Feraud O, Bonnet ML, Desterke C, Khedhir R, Flamant S, Bailey CG, Rasko JE, Dumas P, Bennaceur-Griscelli A, Turhan AG. Direct and rapid identification of T315I-Mutated BCR-ABL expressing leukemic cells using infrared microspectroscopy. Biochemical and biophysical research communications. 2018. 503(3):1861-7." ] } ], [ [ "core", "abalone.tab" ], { "name": "abalone", "description": "Abalone are marine snails. This data set is about predicting abalone age from physical measurements. The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope - a boring and time-consuming task. Other measurements, which are easier to obtain, may be used to predict the age. The data set contains the attributes that report on sex, size and weight measurements. 
The age is reported as the number of rings, which increased by 1.5 gives the age in years.", "collection": "UCI", "version": "1.7", "year": 1994, "instances": 4177, "missing": 0, "variables": 9, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/abalone.tab", "domain": null, "language": "English", "target": "numeric", "location": "abalone.tab", "size": 191993, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Abalone" } ], [ [ "core", "adult.tab" ], { "name": "adult", "description": "Data extracted by Barry Becker from the 1994 Census database so that ((AAGE>16) & (AGI>100) & (AFNLWGT>1) & (HRSWK>0)). The prediction task is determining whether a person makes over 50K yearly. Regarding fairness, \"sex\" is frequently considered the protected attribute, with \"Male\" as the privileged value. Similarly, \"race\" and \"age\" can be treated as protected attributes. For \"race,\" \"Caucasian\" is the privileged value, and for \"age,\" the specified group is between 25 and 60 years.", "collection": "UCI", "version": "1.4", "year": 1996, "instances": 48842, "missing": 1, "variables": 15, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/adult.tab", "domain": null, "language": "English", "target": "categorical", "location": "adult.tab", "size": 5583317, "publication_status": 0, "tag": "economy", "tags": [ "economy", "fairness" ], "title": "Adult Census Income", "references": [ "Ron Kohavi (1996) Scaling Up the Accuracy of Naive-Bayes Classifiers: a Decision-Tree Hybrid. Proceedings of the Second International Conference on Knowledge Discovery and Data Mining." ] } ], [ [ "core", "ames-iowa-housing.xlsx" ], { "name": "ames-iowa-housing", "description": "The Ames Housing dataset contains 79 explanatory variables describing (almost) every aspect of housing in Ames, Iowa, with the goal of predicting sales prices. This dataset is a great alternative to the popular but older Boston Housing dataset. 
The data was compiled by Dean De Cock in 2011 for use in data science education.", "collection": "kaggle", "version": "1.0", "year": 2011, "instances": 2930, "missing": 1, "variables": 81, "source": "Ames Iowa Housing Data on kaggle", "url": "https://datasets.biolab.si/ames-iowa-housing.xlsx", "domain": null, "language": "English", "target": "numeric", "location": "ames-iowa-housing.xlsx", "size": 851185, "publication_status": 0, "tag": "economy", "tags": [ "economy" ], "title": "Ames Iowa Housing" } ], [ [ "core", "amphorae.tab" ], { "name": "amphorae", "description": "An introductory resource for the study of Roman amphorae. There are many types of amphorae - we kept the most well-documented ones, Dressel, Gauloise, and Keay. Each subtype has the corresponding metadata.", "collection": "Archaeology Data Service", "version": "1.0", "year": 2005, "instances": 164, "missing": 1, "variables": 16, "source": "Archaeology Data Service", "url": "https://datasets.biolab.si/amphorae.tab", "domain": null, "language": "English", "target": "categorical", "location": "amphorae.tab", "size": 24289, "publication_status": 0, "tag": "archaeology", "tags": [ "archaeology", "image analytics" ], "title": "Roman Amphorae" } ], [ [ "core", "attrition-predict.tab" ], { "name": "attrition-predict", "description": "Complementary synthetic data set to be used for prediction with Attrition - Train. 
Data instances were created at Biolab for education purposes.", "collection": "IBM", "version": "1.0", "year": 2017, "instances": 3, "missing": 0, "variables": 19, "source": "", "url": "https://datasets.biolab.si/attrition-predict.tab", "domain": null, "language": "English", "target": "none", "location": "attrition-predict.tab", "size": 838, "publication_status": 0, "tag": "economy", "tags": [ "economy", "synthetic", "education" ], "title": "Attrition - Predict" } ], [ [ "core", "attrition-train.tab" ], { "name": "attrition-train", "description": "Uncover the factors that lead to employee attrition and explore important questions such as \u2018show me a breakdown of distance from home by job role and attrition\u2019 or \u2018compare average monthly income by education and attrition\u2019. This is a fictional data set created by IBM data scientists.", "collection": "IBM", "version": "1.0", "year": 2015, "instances": 1470, "missing": 0, "variables": 19, "source": "Watson Analytics Sample Data", "url": "https://datasets.biolab.si/attrition-train.tab", "domain": null, "language": "English", "target": "categorical", "location": "attrition-train.tab", "size": 186525, "publication_status": 0, "tag": "economy", "tags": [ "economy", "synthetic" ], "title": "Attrition - Train" } ], [ [ "core", "auto-mpg.tab" ], { "name": "auto-mpg", "description": "This dataset is a slightly modified version of the dataset provided in the StatLib library. In line with the use by Ross Quinlan (1993) in predicting the attribute 'mpg', 8 of the original instances were removed because they had unknown values for the 'mpg' attribute. 'The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes.' 
(Quinlan, 1993).", "collection": "UCI", "version": "1.1", "year": 1993, "instances": 398, "missing": 1, "variables": 9, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/auto-mpg.tab", "domain": null, "language": "English", "target": "numeric", "location": "auto-mpg.tab", "size": 17768, "publication_status": 0, "tag": null, "tags": [], "title": "Auto MPG", "references": [ "Quinlan, R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann." ] } ], [ [ "core", "bank-additional.tab" ], { "name": "bank-additional", "description": "Data from direct marketing campaigns (phone calls) of a Portuguese bank. The classification goal is to predict if the client will subscribe to a term deposit given the profile of a client that contains attributes such as age, job type, marital status, education, information on previous loans, and others.", "collection": "UCI", "version": "1.0", "year": 2014, "instances": 4119, "missing": 1, "variables": 21, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/bank-additional.tab", "domain": null, "language": "English", "target": "categorical", "location": "bank-additional.tab", "size": 477308, "publication_status": 0, "tag": "economy", "tags": [ "economy" ], "title": "Bank Marketing", "references": [ "Moro S, Cortez P, and Rita P (2014) A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems 62:22-31." 
] } ], [ [ "core", "banking-crises.tab" ], { "name": "banking-crises", "description": "This is a transformed STATA file reporting on the banking crises per year per country.", "collection": "The World Bank", "version": "1.0", "year": 2017, "instances": 211, "missing": 1, "variables": 73, "source": "The World Bank", "url": "https://datasets.biolab.si/banking-crises.tab", "domain": null, "language": "English", "target": "none", "location": "banking-crises.tab", "size": 32083, "publication_status": 0, "tag": "time", "tags": [ "time", "economy" ], "title": "Banking Crises" } ], [ [ "core", "bicycle-gravel-vs-mountain.xlsx" ], { "name": "bicycle-gravel-vs-mountain", "description": "Donated by Janez Dem\u0161ar, this dataset records his bike rides in the year 2023. Each ride is characterized by five different features and labeled with the type of bike Janez used.", "collection": "", "version": "1.0", "year": 2024, "instances": 98, "missing": 0, "variables": 6, "source": "", "url": "https://datasets.biolab.si/bicycle-gravel-vs-mountain.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "bicycle-gravel-vs-mountain.xlsx", "size": 13691, "publication_status": 0, "tag": "sport", "tags": [ "sport" ], "title": "Gravel vs Mountain Bike Rides: Data from Strava" } ], [ [ "core", "bicycle-time-distance.xlsx" ], { "name": "bicycle-time-distance", "description": "Donated by Janez Dem\u0161ar, this dataset records his bike rides in 2023, and includes the total time of the ride, the distance traveled, and the type of bike used.", "collection": "", "version": "1.0", "year": 2024, "instances": 98, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/bicycle-time-distance.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "bicycle-time-distance.xlsx", "size": 10859, "publication_status": 0, "tag": "sport", "tags": [ "sport" ], "title": "Gravel vs Mountain Bike Rides: Distance and Time" } ], [ [ 
"core", "body-fat-brozek.xlsx" ], { "name": "body-fat-brozek", "description": "Lists estimates of the percentage of body fat determined by underwater weighing and includes various body circumference measurements for 252 men. Use the data to determine if body fat can be estimated from body measurements. The data were generously provided by Dr. A. Garth Fisher, Human Performance Research Center, Brigham Young University, Provo, Utah 84602, who granted permission for the data to be freely distributed and used for non-commercial purposes.", "collection": "kaggle", "version": "1.0", "year": 1985, "instances": 252, "missing": 0, "variables": 16, "source": "Body Fat Prediction Dataset on kaggle", "url": "https://datasets.biolab.si/body-fat-brozek.xlsx", "domain": null, "language": "English", "target": "numeric", "location": "body-fat-brozek.xlsx", "size": 66394, "publication_status": 0, "tag": "health", "tags": [ "health" ], "title": "Body Fat Prediction" } ], [ [ "core", "bone-healing.xlsx" ], { "name": "bone-healing", "description": "Microscope images of bone-fracture repair that highlight the involvement of skeletal stem cells. Images are from the laboratory of Dongsu Park at Baylor College of Medicine, Houston.", "collection": "", "version": "1.0", "year": 2018, "instances": 37, "missing": 0, "variables": 9, "source": null, "url": "https://datasets.biolab.si/bone-healing.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "bone-healing.xlsx", "size": 11922, "publication_status": 0, "tag": "image analytics", "tags": [ "image analytics", "biology" ], "title": "Bone Healing" } ], [ [ "core", "brca_metabric.pkl.gz" ], { "name": "brca_metabric.pkl", "description": "Survival data for 1904 patients with primary breast tumors. Contains 35 clinical features and expression data for 24368 genes. 
Log-transformed mRNA z-Scores compared to the expression distribution of all samples (Illumina Human v3 microarray)", "collection": "", "version": "1.2", "year": 2021, "instances": 1904, "missing": 1, "variables": 24403, "source": "cBioPortal", "url": "https://datasets.biolab.si/brca_metabric.pkl.gz", "domain": null, "language": "English", "target": "none", "location": "brca_metabric.pkl.gz", "size": 142871615, "publication_status": 0, "tag": "gene expression", "tags": [ "gene expression", "survival analysis", "censoring" ], "title": "METABRIC: Molecular Taxonomy of Breast Cancer International Consortium", "references": [ "Curtis, Christina et al. \u201cThe genomic and transcriptomic architecture of 2,000 breast tumours reveals novel subgroups.\u201d Nature vol. 486,7403 346-52. 18 Apr. 2012, doi:10.1038/nature10983", "Rueda, Oscar M et al. \u201cDynamics of breast-cancer relapse reveal late-recurring ER-positive genomic subgroups.\u201d Nature vol. 567,7748 (2019): 399-404. doi:10.1038/s41586-019-1007-8", "Pereira, Bernard et al. \u201cThe somatic mutation profiles of 2,433 breast cancers refines their genomic and transcriptomic landscapes.\u201d Nature communications vol. 7 11479. 10 May. 2016, doi:10.1038/ncomms11479" ] } ], [ [ "core", "breast-cancer-wisconsin.tab" ], { "name": "breast-cancer-wisconsin", "description": "Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. Separating plane described above was obtained using Multisurface Method-Tree (MSM-T), a classification method which uses linear programming to construct a decision tree. 
Relevant features were selected using an exhaustive search in the space of 1-4 features and 1-3 separating planes.", "collection": "UCI", "version": "1.0", "year": 1992, "instances": 683, "missing": 0, "variables": 10, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/breast-cancer-wisconsin.tab", "domain": null, "language": "English", "target": "categorical", "location": "breast-cancer-wisconsin.tab", "size": 35787, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Breast Cancer Wisconsin", "references": [ "Wolberg, W.H., & Mangasarian, O.L. (1990). Multisurface method of pattern separation for medical diagnosis applied to breast cytology. In Proceedings of the National Academy of Sciences, 87, 9193-9196." ] } ], [ [ "core", "breast-cancer.tab" ], { "name": "breast-cancer", "description": "This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature. This data set includes 201 instances of one class and 85 instances of another class. The instances are described by 9 attributes, some of which are linear and some are nominal.", "collection": "UCI", "version": "1.0", "year": 1992, "instances": 286, "missing": 1, "variables": 10, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/breast-cancer.tab", "domain": null, "language": "English", "target": "categorical", "location": "breast-cancer.tab", "size": 18801, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Breast Cancer", "references": [ "Michalski, R.S., Mozetic, I., Hong, J., & Lavrac, N. (1986). The Multi-Purpose Incremental Learning System AQ15 and its Testing Application to Three Medical Domains. In Proceedings of the Fifth National Conference on Artificial Intelligence, 1041-1045, Philadelphia, PA: Morgan Kaufmann." 
] } ], [ [ "core", "bridges.tab" ], { "name": "bridges", "description": "This is a design domain where 5 properties (design description) need to be predicted based on 7 specification properties.", "collection": "UCI", "version": "1.0", "year": 1990, "instances": 108, "missing": 1, "variables": 11, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/bridges.tab", "domain": null, "language": "English", "target": "none", "location": "bridges.tab", "size": 6251, "publication_status": 0, "tag": "design", "tags": [ "design" ], "title": "Pittsburgh Bridges", "references": [ "Reich & Fenves (1989). Incremental Learning for Capturing Design Expertise. Technical Report: EDRC 12-34-89, Engineering Design Research Center, Carnegie Mellon University, Pittsburgh, PA." ] } ], [ [ "core", "brown-selected.tab" ], { "name": "brown-selected", "description": "Gene expression of baker's yeast.", "collection": "", "version": "1.0", "year": null, "instances": 186, "missing": 1, "variables": 81, "source": "Brown for Orange", "url": "https://datasets.biolab.si/brown-selected.tab", "domain": null, "language": "English", "target": "categorical", "location": "brown-selected.tab", "size": 98010, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Baker's Yeast", "references": [ "Brown, M.P., Grundy, W.N., Lin, D., Cristianini, N., Sugnet, C., Furey, T.S., Ares, M., Haussler, D. (2000) Knowledge-based analysis of microarray gene expression data by using support vector machines, Proceedings of the National Academy of Sciences, 1 , 262-267." ] } ], [ [ "core", "bupa.tab" ], { "name": "bupa", "description": "The BUPA dataset contains 345 single male patients with 6 numeric attributes. Five of these attributes are blood tests which are thought to be sensitive to liver disorders that might arise from excessive alcohol consumption. 
Each line in the dataset constitutes the record of a single male individual.", "collection": "UCI", "version": "1.0", "year": 1990, "instances": 345, "missing": 0, "variables": 7, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/bupa.tab", "domain": null, "language": "English", "target": "categorical", "location": "bupa.tab", "size": 7369, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Liver Disorders", "references": [ "McDermott & Forsyth 2016, Diagnosing a disorder in a classification benchmark, Pattern Recognition Letters, Volume 73." ] } ], [ [ "core", "car.tab" ], { "name": "car", "description": "This is a synthetic data set derived from a simple hierarchical decision model to demonstrate decision support system DEX (see Bohanec and Rajkovic). The decision model included six attributes, including buying and maintenance price, the number of passengers, size of the luggage booth, and evaluated the utility of the car from a buyer's perspective. All attributes were discrete, having from three to four values. The data set provides car's utility for all possible combinations of attribute values. The data set was originally created to showcase the ability of machine learning by function decomposition to recreate the hierarchy of the decision model.", "collection": "UCI", "version": "1.0", "year": 1999, "instances": 1728, "missing": 0, "variables": 7, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/car.tab", "domain": null, "language": "English", "target": "categorical", "location": "car.tab", "size": 51939, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic" ], "title": "Car Evaluation", "references": [ "Bohanec M, Rajkovic V (1988) Knowledge acquisition and explanation for multi-attribute decision making. In 8th International Workshop on Expert Systems and their Applications, Avignon, France. 
pages 59-78.", "Zupan B, Bohanec M, Demsar J, Bratko I (1999) Learning by discovering concept hierarchies, Artificial Intelligence 109: 211-242." ] } ], [ [ "core", "climate-europe.tab" ], { "name": "climate-europe", "description": "Database of European cities, their climatic characteristics and associated climate. The database contains 11 variables describing the climatic characteristics of cities: record high temperature [\u00b0C], average high temperature [\u00b0C], daily average temperature [\u00b0C], average low temperature [\u00b0C], record low temperature [\u00b0C], average precipitation [mm], number of rainy days, number of snowy days, average relative humidity (%), average monthly sunshine hours, average ultraviolet index. For each city, it is then noted in which climate it is classified. Each city is also given its geographical longitude and latitude.", "collection": "UCI", "version": "1.0", "year": 2015, "instances": 41, "missing": 1, "variables": 16, "source": "", "url": "https://datasets.biolab.si/climate-europe.tab", "domain": null, "language": "English", "target": "none", "location": "climate-europe.tab", "size": 4478, "publication_status": 0, "tag": "geography", "tags": [ "geography" ], "title": "Climate of European cities" } ], [ [ "core", "compas-scores-two-years.tab" ], { "name": "compas-scores-two-years", "description": "With the COMPAS dataset, we can predict the likelihood of a defendant becoming a recidivist. The data contains various attributes: age, race, sex, priors count, charge degree, and more. The data was, in detail, analyzed by ProPublica, which identified racial disparities in its predictions. We have included all date/time features as meta attributes. When addressing fairness, \"race\" is often considered the protected attribute, with \"Caucasian\" being the privileged value. 
In some studies, \"sex\" is also treated as the protected attribute, designating \"female\" as the privileged value.", "collection": "ProPublica", "version": "1.0", "year": 2014, "instances": 7214, "missing": 1, "variables": 52, "source": "ProPublica GitHub Repository", "url": "https://datasets.biolab.si/compas-scores-two-years.tab", "domain": null, "language": "English", "target": "categorical", "location": "compas-scores-two-years.tab", "size": 2779697, "publication_status": 0, "tag": "criminal justice", "tags": [ "criminal justice", "fairness" ], "title": "COMPAS Analysis", "references": [ "Angwin, J., Larson, J., Mattu, S., & Kirchner, L. (2016). Machine Bias. ProPublica.", "Larson, J., Mattu, S., Kirchner, L., & Angwin, J. (2016). How We Analyzed the COMPAS Recidivism Algorithm. ProPublica." ] } ], [ [ "core", "conferences.tab" ], { "name": "conferences", "description": "Machine learning publications with author and paper counts.", "collection": "", "version": "1.0", "year": null, "instances": 42, "missing": 0, "variables": 5, "source": "", "url": "https://datasets.biolab.si/conferences.tab", "domain": null, "language": "English", "target": "none", "location": "conferences.tab", "size": 2337, "publication_status": 0, "tag": null, "tags": [], "title": "Conferences" } ], [ [ "core", "cyber-security-breaches.tab" ], { "name": "cyber-security-breaches", "description": "Reports on security breaches in the US states between 1997 and 2014. 
Data reports on type, individuals affected, location of the breach and entity involved.", "collection": "PRC", "version": "1.0", "year": 2014, "instances": 1055, "missing": 1, "variables": 10, "source": "Hype and Heavy Tails: a closer look at data breaches", "url": "https://datasets.biolab.si/cyber-security-breaches.tab", "domain": null, "language": "English", "target": "none", "location": "cyber-security-breaches.tab", "size": 230445, "publication_status": 0, "tag": "security", "tags": [ "security", "time", "geo" ], "title": "Cyber Security Breaches" } ], [ [ "core", "dermatology.tab" ], { "name": "dermatology", "description": "The differential diagnosis of erythemato-squamous diseases is a real problem in dermatology. They all share the clinical features of erythema and scaling, with very little differences. The diseases in this group are psoriasis, seboreic dermatitis, lichen planus, pityriasis rosea, chronic dermatitis, and pityriasis rubra pilaris. Usually a biopsy is necessary for the diagnosis but unfortunately these diseases share many histopathological features as well. Another difficulty for the differential diagnosis is that a disease may show the features of another disease at the beginning stage and may have the characteristic features at the following stages. Patients were first evaluated clinically with 12 features. Afterwards, skin samples were taken for the evaluation of 22 histopathological features. The values of the histopathological features are determined by an analysis of the samples under a microscope. In the dataset constructed for this domain, the family history feature has the value 1 if any of these diseases has been observed in the family, and 0 otherwise. The age feature simply represents the age of the patient. Every other feature (clinical and histopathological) was given a degree in the range of 0 to 3. 
Here, 0 indicates that the feature was not present, 3 indicates the largest amount possible, and 1, 2 indicate the relative intermediate values.", "collection": "UCI", "version": "1.0", "year": 1998, "instances": 366, "missing": 1, "variables": 35, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/dermatology.tab", "domain": null, "language": "English", "target": "categorical", "location": "dermatology.tab", "size": 31682, "publication_status": 0, "tag": "biology", "tags": [ "biology", "medical" ], "title": "Dermatology", "references": [ "G. Demiroz, H. A. Guvenir, and N. Ilter, Learning Differential Diagnosis of Erythemato-Squamous Diseases using Voting Feature Intervals, Artificial Intelligence in Medicine." ] } ], [ [ "core", "dicty-development.xlsx" ], { "name": "dicty-development", "description": "Images of strains of the social amoeba Dictyostelium discoideum from Gad Shaulsky's Lab at Baylor College of Medicine, Houston, USA.", "collection": "", "version": "1.1", "year": 2018, "instances": 152, "missing": 0, "variables": 7, "source": "", "url": "https://datasets.biolab.si/dicty-development.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "dicty-development.xlsx", "size": 15904, "publication_status": 0, "tag": "image analytics", "tags": [ "image analytics", "biology" ], "title": "Development of Social Amoeba", "references": [ "Li CL, Santhanam B, Webb AN, Zupan B, Shaulsky G (2016) Gene discovery by chemical mutagenesis and whole-genome sequencing in Dictyostelium. Genome Res 26(9):1268-76." ] } ], [ [ "core", "ewba-slovenia-illegal-dumpsites.tab" ], { "name": "ewba-slovenia-illegal-dumpsites", "description": "Illegal waste dumpsites in Slovenia as tracked by Ecologists Without Borders Association. 
The data contains geographical location of the site and its description, including information about accessibility, access type, waste area and volume and types of waste deposited.", "collection": "", "version": "1.0", "year": 2017, "instances": 13165, "missing": 1, "variables": 29, "source": "Dumpsite Registry of Slovenia", "url": "https://datasets.biolab.si/ewba-slovenia-illegal-dumpsites.tab", "domain": null, "language": "English", "target": "none", "location": "ewba-slovenia-illegal-dumpsites.tab", "size": 2907714, "publication_status": 0, "tag": "geo", "tags": [ "geo", "timeseries", "ecology" ], "title": "Illegal waste dumpsites in Slovenia" } ], [ [ "core", "food-nutrition-info.tab" ], { "name": "food-nutrition-info", "description": "Nutritional information for selected raw fruits, vegetables and seafood. The data is borrowed from data.world, where it was created by Adam Helsinger. Where values are given in both mass and percent of daily value, the former is retained and the latter is moved to the meta-attribute.", "collection": "data.word", "version": "1.0", "year": 2017, "instances": 61, "missing": 0, "variables": 25, "source": "Food Nutrition Information @ data.world", "url": "https://datasets.biolab.si/food-nutrition-info.tab", "domain": null, "language": "English", "target": "categorical", "location": "food-nutrition-info.tab", "size": 6672, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Food Nutrition Information" } ], [ [ "core", "foodmart.basket" ], { "name": "foodmart", "description": "Foodmart 2000 is a market based dataset that came with Microsoft Analysis Services. For every transaction (rows) it contains tuples of item names and number of items bought. 
Every transaction also contains the store ID.", "collection": "", "version": "1.0", "year": 2005, "instances": 62560, "missing": 0, "variables": 126, "source": "neo4j data repository", "url": "https://datasets.biolab.si/foodmart.basket", "domain": null, "language": "English", "target": "none", "location": "foodmart.basket", "size": 4212566, "publication_status": 0, "tag": "economy", "tags": [ "economy", "associate", "basket" ], "title": "Foodmart 2000" } ], [ [ "core", "forestfires.tab" ], { "name": "forestfires", "description": "This is a difficult regression task, where the aim is to predict the burned area of forest fires in the northeast region of Portugal. The attributes report on meteorological data (temperature, wind, rain, humidity), month and day of the status, several indices of the Forest Fire Weather Index, and spatial coordinates within the Montesinho park map. Two extra meta attributes are included in the Orange data set that encode the log of area + 1 and the binary attribute reporting if the part of the park was under fire (non-zero fire area).", "collection": "UCI", "version": "1.2", "year": 2007, "instances": 517, "missing": 0, "variables": 15, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/forestfires.tab", "domain": null, "language": "English", "target": "numeric", "location": "forestfires.tab", "size": 32101, "publication_status": 0, "tag": "ecology", "tags": [ "ecology" ], "title": "Forest Fires", "references": [ "Cortez P and Morais A (2007) A Data Mining Approach to Predict Forest Fires using Meteorological Data. In Proc. of the 13th Portuguese Conference on Artificial Intelligence, Portugal, pp. 512-523." ] } ], [ [ "core", "gbsg2.tab" ], { "name": "gbsg2", "description": "Survival data for 686 female patients from the German Breast Cancer Study Group 2. 
Contains 10 features (Recurrence Free Survival Time, Recurrence Event, Menopausal Status, Age of the Patient, Tumor Grade, Hormonal Therapy, Tumor Size, Number of Positive Nodes, Progesterone Receptor, Estrogen Receptor).", "collection": "", "version": "1.2", "year": 1994, "instances": 686, "missing": 0, "variables": 10, "source": "Torsten Hothorn Data Archive", "url": "https://datasets.biolab.si/gbsg2.tab", "domain": null, "language": "English", "target": "none", "location": "gbsg2.tab", "size": 30294, "publication_status": 0, "tag": "survival analysis", "tags": [ "survival analysis", "censoring" ], "title": "German BC2: German Breast Cancer Study Group 2", "references": [ "W. Sauerbrei and P. Royston (1999). Building multivariable prognostic and diagnostic models: transformation of the predictors by using fractional polynomials. Journal of the Royal Statistical Society Series A, Volume 162(1), 71\u201394.", "M. Schumacher, G. Bastert, H. Bojar, K. Huebner, M. Olschewski, W. Sauerbrei, C. Schmoor, C. Beyerle, R.L.A. Neumann and H.F. Rauschecker for the German Breast Cancer Study Group (1994), Randomized 2\u00d72 trial evaluating hormonal treatment and the duration of chemotherapy in node-positive breast cancer patients. Journal of Clinical Oncology, 12, 2086\u20132093." ] } ], [ [ "core", "german-credit-data.tab" ], { "name": "german-credit-data", "description": "The data categorizes individuals, based on various attributes, into good or bad credit risks. Attributes contain categorical and numeric features such as age, job, credit history, and personal status. In classification, misclassifying a \"bad\" customer as \"good\" is more problematic than misclassifying a good customer as bad. Regarding fairness, \"sex\" is the protected attribute, with \"male\" as the privileged value. 
Additionally, we can regard age as a protected attribute after binarizing it into two groups: <=25 and >25 years old.", "collection": "UCI", "version": "1.0", "year": 1994, "instances": 1000, "missing": 0, "variables": 21, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/german-credit-data.tab", "domain": null, "language": "English", "target": "categorical", "location": "german-credit-data.tab", "size": 181879, "publication_status": 0, "tag": "finance", "tags": [ "finance", "fairness" ], "title": "German Credit Data", "references": [ "Hofmann,Hans. (1994). Statlog (German Credit Data). UCI Machine Learning Repository. https://doi.org/10.24432/C5NC77." ] } ], [ [ "core", "glass.tab" ], { "name": "glass", "description": "Determine whether the glass was a type of 'float' glass or not. The study of classification of types of glass was motivated by criminological investigation. At the scene of the crime, the glass left can be used as evidence... if it is correctly identified!", "collection": "UCI", "version": "1.0", "year": 1987, "instances": 214, "missing": 0, "variables": 10, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/glass.tab", "domain": null, "language": "English", "target": "categorical", "location": "glass.tab", "size": 10625, "publication_status": 0, "tag": "physics", "tags": [ "physics", "criminology" ], "title": "Glass", "references": [ "I. W. Evett and E. J. Spiehler, Rule Induction in Forensic Science, KBS in Government, 1987, pp. 107-118." 
] } ], [ [ "core", "grades-two.tab" ], { "name": "grades-two", "description": "A small data set with final grades from English and Mathematics that was hand-crafted to introduce Euclidean distance and hierarchical clustering.", "collection": "", "version": "1.0", "year": null, "instances": 12, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/grades-two.tab", "domain": null, "language": "English", "target": "none", "location": "grades-two.tab", "size": 265, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic", "educational" ], "title": "Grades for English and Math" } ], [ [ "core", "grades.xlsx" ], { "name": "grades", "description": "A small dataset with grades on seven courses (English, French, History, Algebra, Biology, Physics, Physical) that was handcrafted to introduce hierarchical clustering.", "collection": "", "version": "1.0", "year": null, "instances": 16, "missing": 0, "variables": 8, "source": "", "url": "https://datasets.biolab.si/grades.xlsx", "domain": null, "language": "English", "target": "none", "location": "grades.xlsx", "size": 9428, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic", "education" ], "title": "Course Grades" } ], [ [ "core", "hair-spectroscopy.tab" ], { "name": "hair-spectroscopy", "description": "An FTIR hyperspectral dataset of hair sections measured on ZnS IR transparent windows in transmission mode. This dataset is part of a study by Sandt et al. (2021) on the chemical composition of hair with special focus on the center part, which often presents a special compartment called the medulla. The data was collected and compiled by Dr. 
Christophe Sandt.", "collection": "spectral", "version": "1.0", "year": 2021, "instances": 3250, "missing": 0, "variables": 833, "source": "", "url": "https://datasets.biolab.si/hair-spectroscopy.tab", "domain": null, "language": "English", "target": "none", "location": "hair-spectroscopy.tab", "size": 19122825, "publication_status": 0, "tag": "spectral", "tags": [ "spectral", "hyperspectral" ], "title": "Hair section", "references": [ "Sandt C, Borondics F. A new typology of human hair medullas based on lipid composition analysis by synchrotron FTIR microspectroscopy. Analyst, 2021, 146, 3942-3954" ] } ], [ [ "core", "hdi-edu.tab" ], { "name": "hdi-edu", "description": "The Human Development Index (HDI) is a summary measure of average achievement in key dimensions of human development: a long and healthy life, knowledge, and decent living. The data includes HDI as reported in 2015 and selected related socioeconomic data from countries worldwide. We have removed HDI-related variables and countries with a missing HDI score. This dataset is used in the \"Socio-economic characteristics of countries\" activity on the pumice.si website.", "collection": "UNDP Human Development Reports", "version": "1.0", "year": 2015, "instances": 188, "missing": 1, "variables": 53, "source": "Human Development Data (1990-2015)", "url": "https://datasets.biolab.si/hdi-edu.tab", "domain": null, "language": "English", "target": "none", "location": "hdi-edu.tab", "size": 55489, "publication_status": 0, "tag": "economy", "tags": [ "economy", "geography" ], "title": "HDI-edu" } ], [ [ "core", "hdi-slo.tab" ], { "name": "hdi-slo", "description": "Indeks \u010dlove\u0161kega razvoja (HDI) je skupno merilo povpre\u010dnih dose\u017ekov na klju\u010dnih podro\u010djih \u010dlove\u0161kega razvoja: dolgo in zdravo \u017eivljenje, znanje in dostojno \u017eivljenje. 
Podatki vklju\u010dujejo HDI, kot je bil sporo\u010den leta 2015, in izbrane povezane socialno-ekonomske podatke za dr\u017eave po vsem svetu. Odstranili smo spremenljivke ekvivalentne indeksu HDI in dr\u017eave z manjkajo\u010dim rezultatom HDI.", "collection": "UNDP Human Development Reports", "version": "1.3", "year": 2015, "instances": 188, "missing": 1, "variables": 53, "source": "Human Development Data (1990-2015)", "url": "https://datasets.biolab.si/hdi-slo.tab", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "none", "location": "hdi-slo.tab", "size": 50540, "publication_status": 0, "tag": "ekonomija", "tags": [ "ekonomija", "geografija" ], "title": "Indeks \u010dlove\u0161kega razvoja (HDI)" } ], [ [ "core", "heart_disease.tab" ], { "name": "heart_disease", "description": "This data uses a subset of 14 attributes from the Cleveland database. The 'goal' field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0).", "collection": "UCI", "version": "1.0", "year": 1988, "instances": 303, "missing": 1, "variables": 14, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/heart_disease.tab", "domain": null, "language": "English", "target": "categorical", "location": "heart_disease.tab", "size": 24031, "publication_status": 0, "tag": "biology", "tags": [ "biology", "medicine" ], "title": "Heart Disease", "references": [ "Detrano, R., Janosi, A., Steinbrunn, W., Pfisterer, M., Schmid, J., Sandhu, S., Guppy, K., Lee, S., & Froelicher, V. (1989). International application of a new probability algorithm for the diagnosis of coronary artery disease. American Journal of Cardiology, 64, 304-310." 
] } ], [ [ "core", "housing.tab" ], { "name": "housing", "description": "This dataset contains information collected by the U.S Census Service concerning housing in the area of Boston. It was obtained from the StatLib archive and has been used extensively throughout the literature to benchmark algorithms. This data was originally a part of UCI Machine Learning Repository and has been removed. The objective is to predict the value of prices of the house using the given features.", "collection": "StatLib", "version": "1.0", "year": 1979, "instances": 506, "missing": 0, "variables": 14, "source": "StatLib", "url": "https://datasets.biolab.si/housing.tab", "domain": null, "language": "English", "target": "numeric", "location": "housing.tab", "size": 34742, "publication_status": 0, "tag": "economy", "tags": [ "economy" ], "title": "Housing", "references": [ "Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the demand for clean air. J. Environ. Economics and Management 5, 81\u2013102." ] } ], [ [ "core", "hrm-employee-attrition.xlsx" ], { "name": "hrm-employee-attrition", "description": "A fictional data set created by IBM data scientists to demonstrate the use of Watson Analytics. The data reports on factors such as employees' age, gender, salary, job role and satisfaction, and asks to relate these to attrition.", "collection": "IBM", "version": "1.0", "year": 2015, "instances": 1470, "missing": 0, "variables": 33, "source": "IBM Watson Analytics", "url": "https://datasets.biolab.si/hrm-employee-attrition.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "hrm-employee-attrition.xlsx", "size": 262432, "publication_status": 0, "tag": "economy", "tags": [ "economy", "synthetic" ], "title": "Employee attrition" } ], [ [ "core", "imports-85.tab" ], { "name": "imports-85", "description": "This data set includes the specification of cars imported to the US in 1985. 
The class variable is set to the price of the car but can be changed to other target variables. The data includes three types of entities: (a) the specification of an auto in terms of various characteristics, (b) its assigned insurance risk rating, (c) its normalized losses in use as compared to other cars. The second rating corresponds to the degree to which the auto is riskier than its price indicates. Cars are initially assigned a risk factor symbol associated with its price. Then, if it is more risky (or less), this symbol is adjusted by moving it up (or down) the scale. Actuaries call this process symboling. A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe. ", "collection": "UCI", "version": "1.0", "year": 1987, "instances": 205, "missing": 1, "variables": 26, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/imports-85.tab", "domain": null, "language": "English", "target": "numeric", "location": "imports-85.tab", "size": 26300, "publication_status": 0, "tag": "insurance", "tags": [ "insurance", "economy" ], "title": "Imports 1985" } ], [ [ "core", "ionosphere.tab" ], { "name": "ionosphere", "description": "This radar data was collected by a system in Goose Bay, Labrador. This system consists of a phased array of 16 high-frequency antennas with a total transmitted power on the order of 6.4 kilowatts. The targets were free electrons in the ionosphere. 'Good' radar returns are those showing evidence of some type of structure in the ionosphere. 'Bad' returns are those that do not; their signals pass through the ionosphere. Received signals were processed using an autocorrelation function whose arguments are the time of a pulse and the pulse number. There were 17 pulse numbers for the Goose Bay system. 
Instances in this database are described by 2 attributes per pulse number, corresponding to the complex values returned by the function resulting from the complex electromagnetic signal.", "collection": "UCI", "version": "1.0", "year": 1989, "instances": 351, "missing": 0, "variables": 33, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/ionosphere.tab", "domain": null, "language": "English", "target": "categorical", "location": "ionosphere.tab", "size": 76708, "publication_status": 0, "tag": "physics", "tags": [ "physics" ], "title": "Ionosphere", "references": [ "Sigillito, V. G., Wing, S. P., Hutton, L. V., & Baker, K. B. (1989). Classification of radar returns from the ionosphere using neural networks. Johns Hopkins APL Technical Digest, 10, 262-266." ] } ], [ [ "core", "iris.tab" ], { "name": "iris", "description": "The Iris flower data set or Fisher's Iris data set was introduced by the British statistician and biologist Ronald Fisher in his 1936 paper as an example of linear discriminant analysis. The data on length and width of petal and sepal leafs was actually collected by American botanist Edgar Anderson to quantify the morphologic variation of Iris flowers of three related species.", "collection": "UCI", "version": "1.0", "year": 1936, "instances": 150, "missing": 0, "variables": 5, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/iris.tab", "domain": null, "language": "English", "target": "categorical", "location": "iris.tab", "size": 4625, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Iris", "references": [ "R. A. Fisher (1936) The use of multiple measurements in taxonomic problems. Annals of Eugenics 7(2):179\u2013188." ] } ], [ [ "core", "kickstarter.tab" ], { "name": "kickstarter", "description": "Basic profiling of Kickstarter project pages at the time of the start of the campaign. The class label records if the project was funded. 
The data is on a small sample of Kickstarter projects whose campaigns started from January to April, 2016. Even though the attributes contain very basic information about the web pages, like the number of videos and images included, it is surprising that these are sufficient for solid prediction of success of the project.", "collection": "", "version": "1.0", "year": 2016, "instances": 1163, "missing": 0, "variables": 20, "source": "", "url": "https://datasets.biolab.si/kickstarter.tab", "domain": null, "language": "English", "target": "categorical", "location": "kickstarter.tab", "size": 219235, "publication_status": 0, "tag": "economy", "tags": [ "economy" ], "title": "Kickstarter projects" } ], [ [ "core", "lenses.tab" ], { "name": "lenses", "description": "The following example is taken from the world of ophthalmic optics. The aim is to determine whether a patient is suitable for contact lens wear and for which type of contacts.", "collection": "UCI", "version": "1.0", "year": 1990, "instances": 24, "missing": 0, "variables": 5, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/lenses.tab", "domain": null, "language": "English", "target": "categorical", "location": "lenses.tab", "size": 968, "publication_status": 0, "tag": "medical", "tags": [ "medical" ], "title": "Lenses", "references": [ "Cendrowska, J., PRISM: An algorithm for inducing modular rules, International Journal of Man-Machine Studies, 1987, 27, 349-370." ] } ], [ [ "core", "liver-cirrhosis.tab" ], { "name": "liver-cirrhosis", "description": "Liver is the organ in charge of food and drug detoxification, cholesterol, lipids and glycogen synthesis. In case of cirrhosis, some parts of the liver become isolated in nodules by fibrotic tissue rich in collagen. The infrared map was recorded at the intersection between several cirrhotic nodules. 
The Fourier transform infrared spectroscopy (FTIR) measurement was performed in trans-reflection mode from a 10 \u00b5m thick section at 8x8 \u00b5m\u00b2 spatial resolution. The data was imaged by dr. Christophe Sandt.", "collection": "spectral", "version": "1.0", "year": 2018, "instances": 1078, "missing": 0, "variables": 546, "source": "", "url": "https://datasets.biolab.si/liver-cirrhosis.tab", "domain": null, "language": "English", "target": "none", "location": "liver-cirrhosis.tab", "size": 3546498, "publication_status": 0, "tag": "spectral", "tags": [ "spectral", "hyperspectral" ], "title": "Liver cirrhosis - spectral image" } ], [ [ "core", "liver-spectroscopy.tab" ], { "name": "liver-spectroscopy", "description": "Data on cells measured with Fourier transform infrared spectroscopy (FTIR) and annotated according to the majority presence of a chemical compound (collagen, glycogen, lipids, or DNA) in that part of the cell. Each row represents the data on specific cell, with components of the spectra given in columns. The data was compiled by dr. Christophe Sandt.", "collection": "spectral", "version": "1.0", "year": 2017, "instances": 731, "missing": 0, "variables": 235, "source": "", "url": "https://datasets.biolab.si/liver-spectroscopy.tab", "domain": null, "language": "English", "target": "categorical", "location": "liver-spectroscopy.tab", "size": 1018671, "publication_status": 0, "tag": "spectral", "tags": [ "spectral" ], "title": "Liver spectroscopy (Collagen)" } ], [ [ "core", "lymphography.tab" ], { "name": "lymphography", "description": "This is a small medical dataset containing 148 instances. The task is to distinguish healthy patients from those with metastases or malignant lymphoma. 
This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature.", "collection": "UCI", "version": "1.0", "year": 1988, "instances": 148, "missing": 0, "variables": 19, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/lymphography.tab", "domain": null, "language": "English", "target": "categorical", "location": "lymphography.tab", "size": 14939, "publication_status": 0, "tag": "medical", "tags": [ "medical" ], "title": "Lymphography", "references": [ "Cestnik, B., Kononenko, I., & Bratko, I. (1987). Assistant-86: A Knowledge-Elicitation Tool for Sophisticated Users. In I. Bratko & N. Lavrac (Eds.) Progress in Machine Learning, 31-45, Sigma Press." ] } ], [ [ "core", "market-basket.tab" ], { "name": "market-basket", "description": "Synthetic example of market basket for introducing association rules and frequent itemsets.", "collection": "", "version": "1.0", "year": null, "instances": 5, "missing": 1, "variables": 6, "source": "", "url": "https://datasets.biolab.si/market-basket.tab", "domain": null, "language": "English", "target": "none", "location": "market-basket.tab", "size": 99, "publication_status": 0, "tag": "economy", "tags": [ "economy", "synthetic" ], "title": "Market Basket" } ], [ [ "core", "melanoma.tab" ], { "name": "melanoma", "description": "Survival data for 205 patients with malignant melanoma, collected at Odense University Hospital in Denmark from years 1962 to 1977. 
Includes 7 features (Overall Survival Time, Event, Sex, Tumor Thickness, Ulceration, Age, Year of Operation).", "collection": "", "version": "1.2", "year": 1993, "instances": 205, "missing": 0, "variables": 7, "source": "Modern Applied Statistics with S (4th edition, 2002).", "url": "https://datasets.biolab.si/melanoma.tab", "domain": null, "language": "English", "target": "none", "location": "melanoma.tab", "size": 6557, "publication_status": 0, "tag": "survival analysis", "tags": [ "survival analysis", "censoring" ], "title": "Melanoma: Survival from Malignant Melanoma", "references": [ "P. K. Andersen, O. Borgan, R. D. Gill and N. Keiding (1993) Statistical Models based on Counting Processes. Springer." ] } ], [ [ "core", "monks-1.tab" ], { "name": "monks-1", "description": "Once upon a time, in July 1991, the monks of Corsendonk Priory were faced with a school held in their priory, namely the 2nd European Summer School on Machine Learning. After listening more than one week to a wide variety of learning algorithms, they felt rather confused: Which algorithm would be optimal? And which one to avoid? As a consequence of this dilemma, they created a simple task on which all learning algorithms ought to be compared: the three MONK's problems. The target concept associated with the 1st Monk's problem is the binary outcome of the logical formula: MONK-1: (a1 == a2) or (a5 == 1)", "collection": "UCI", "version": "1.0", "year": 1992, "instances": 556, "missing": 0, "variables": 7, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/monks-1.tab", "domain": null, "language": "English", "target": "categorical", "location": "monks-1.tab", "size": 12733, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic" ], "title": "MONK's 1", "references": [ "The MONK's Problems - A Performance Comparison of Different Learning Algorithms, by S.B. Thrun, J. Bala, E. Bloedorn, I. Bratko, B. Cestnik, J. Cheng, K. De Jong, S. Dzeroski, S.E. Fahlman, D. Fisher, R. 
Hamann, K. Kaufman, S. Keller, I. Kononenko, J. Kreuziger, R.S. Michalski, T. Mitchell, P. Pachowicz, Y. Reich H. Vafaie, W. Van de Welde, W. Wenzel, J. Wnek, and J. Zhang. Technical Report CS-CMU-91-197, Carnegie Mellon University, Dec. 1991." ] } ], [ [ "core", "monks-2.tab" ], { "name": "monks-2", "description": "Once upon a time, in July 1991, the monks of Corsendonk Priory were faced with a school held in their priory, namely the 2nd European Summer School on Machine Learning. After listening more than one week to a wide variety of learning algorithms, they felt rather confused: Which algorithm would be optimal? And which one to avoid? As a consequence of this dilemma, they created a simple task on which all learning algorithms ought to be compared: the three MONK's problems. The target concept associated with the 2nd Monk's problem is the binary outcome of the logical formula: MONK-2: EXACTLY TWO of {a1 = 1, a2 = 1, a3 = 1, a4 = 1, a5 = 1, a6 = 1}", "collection": "UCI", "version": "1.0", "year": 1992, "instances": 601, "missing": 0, "variables": 7, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/monks-2.tab", "domain": null, "language": "English", "target": "categorical", "location": "monks-2.tab", "size": 13754, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic" ], "title": "MONK's 2", "references": [ "The MONK's Problems - A Performance Comparison of Different Learning Algorithms, by S.B. Thrun, J. Bala, E. Bloedorn, I. Bratko, B. Cestnik, J. Cheng, K. De Jong, S. Dzeroski, S.E. Fahlman, D. Fisher, R. Hamann, K. Kaufman, S. Keller, I. Kononenko, J. Kreuziger, R.S. Michalski, T. Mitchell, P. Pachowicz, Y. Reich H. Vafaie, W. Van de Welde, W. Wenzel, J. Wnek, and J. Zhang. Technical Report CS-CMU-91-197, Carnegie Mellon University, Dec. 1991." 
] } ], [ [ "core", "monks-3.tab" ], { "name": "monks-3", "description": "Once upon a time, in July 1991, the monks of Corsendonk Priory were faced with a school held in their priory, namely the 2nd European Summer School on Machine Learning. After listening more than one week to a wide variety of learning algorithms, they felt rather confused: Which algorithm would be optimal? And which one to avoid? As a consequence of this dilemma, they created a simple task on which all learning algorithms ought to be compared: the three MONK's problems. The target concept associated with the 3rd Monk's problem is the binary outcome of the logical formula: MONK-3: (a5 = 3 and a4 = 1) or (a5 /= 4 and a2 /= 3). In addition, 5% class noise was added to the training set.", "collection": "UCI", "version": "1.0", "year": 1992, "instances": 554, "missing": 0, "variables": 7, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/monks-3.tab", "domain": null, "language": "English", "target": "categorical", "location": "monks-3.tab", "size": 12674, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic" ], "title": "MONK's 3", "references": [ "The MONK's Problems - A Performance Comparison of Different Learning Algorithms, by S.B. Thrun, J. Bala, E. Bloedorn, I. Bratko, B. Cestnik, J. Cheng, K. De Jong, S. Dzeroski, S.E. Fahlman, D. Fisher, R. Hamann, K. Kaufman, S. Keller, I. Kononenko, J. Kreuziger, R.S. Michalski, T. Mitchell, P. Pachowicz, Y. Reich H. Vafaie, W. Van de Welde, W. Wenzel, J. Wnek, and J. Zhang. Technical Report CS-CMU-91-197, Carnegie Mellon University, Dec. 1991." ] } ], [ [ "core", "mtsamples.tab" ], { "name": "mtsamples", "description": "A sample of 950 medical transcriptions from three categories (from the original 40) -- Cardiovascular/Pulmonary, Neurology, and Orthopedic. Each document is a transcription of a medical report. 
Each document is equipped with keywords for potential keyword analysis.", "collection": "", "version": "1.0", "year": 2023, "instances": 950, "missing": 0, "variables": 5, "source": "https://mtsamples.com/", "url": "https://datasets.biolab.si/mtsamples.tab", "domain": null, "language": "English", "target": "categorical", "location": "mtsamples.tab", "size": 3455067, "publication_status": 0, "tag": "text", "tags": [ "text", "classification", "medical" ], "title": "mtsamples" } ], [ [ "core", "multitarget-synthetic.tab" ], { "name": "multitarget-synthetic", "description": "Synthetic data set for multitarget regression.", "collection": "", "version": "1.0", "year": null, "instances": 100, "missing": 0, "variables": 7, "source": "", "url": "https://datasets.biolab.si/multitarget-synthetic.tab", "domain": null, "language": "English", "target": "none", "location": "multitarget-synthetic.tab", "size": 4486, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic" ], "title": "Multitarget Synthetic" } ], [ [ "core", "nba.tab" ], { "name": "nba", "description": "Database of basketball players and their characteristics. 
The database contains 16 variables describing the characteristics of each player: Position, Team, Age, Height, Weight, Hand, 3P%, 2P%, FT%, TRB, AST, STL, BLK, TOV, PF, PTS.", "collection": "", "version": "1.0", "year": 2023, "instances": 248, "missing": 1, "variables": 17, "source": "Basketball References NBA 2023 (January 2023, mid season 2022/23)", "url": "https://datasets.biolab.si/nba.tab", "domain": null, "language": "English", "target": "categorical", "location": "nba.tab", "size": 21748, "publication_status": 0, "tag": "sport", "tags": [ "sport" ], "title": "NBA Players" } ], [ [ "core", "oocyte-competence.xlsx" ], { "name": "oocyte-competence", "description": "Fluorescence images of the nucleus of mouse fully-grown antral oocytes from University of Pavia.", "collection": "", "version": "1.0", "year": 2018, "instances": 131, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/oocyte-competence.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "oocyte-competence.xlsx", "size": 9707, "publication_status": 0, "tag": "image analytics", "tags": [ "image analytics", "biology" ], "title": "Mammalian Oocyte Developmental Competence" } ], [ [ "core", "philadelphia-crime.csv.xz" ], { "name": "philadelphia-crime.csv", "description": "Data from minor criminal acts in Philadelphia.", "collection": "", "version": "1.0", "year": 2016, "instances": 9666, "missing": 0, "variables": 4, "source": "", "url": "https://datasets.biolab.si/philadelphia-crime.csv.xz", "domain": null, "language": "English", "target": "none", "location": "philadelphia-crime.csv.xz", "size": 92660, "publication_status": 0, "tag": "criminology", "tags": [ "criminology", "time", "geo" ], "title": "Philadelphia Crime" } ], [ [ "core", "pima-diabetes.xlsx" ], { "name": "pima-diabetes", "description": "The goal of the dataset is to assess if we can predict whether or not a patient has diabetes based on diagnostic measures that include the number 
of pregnancies the person has had, their body mass index, insulin levels, and age. Several restrictions were placed on the selection of subjects for this study. Specifically, all patients are women at least 21 years of age of Pima Indian descent.", "collection": "UCI", "version": "1.3", "year": 1988, "instances": 768, "missing": 1, "variables": 9, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/pima-diabetes.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "pima-diabetes.xlsx", "size": 43830, "publication_status": 0, "tag": "medicine", "tags": [ "medicine" ], "title": "Pima Indians Diabetes", "references": [ "Smith, J.W., Everhart, J.E., Dickson, W.C., Knowler, W.C., & Johannes, R.S. (1988). Using the ADAP learning algorithm to forecast the onset of diabetes mellitus. In Proc. of the Symposium on Computer Applications and Medical Care (pp. 261--265). IEEE Computer Society Press." ] } ], [ [ "core", "podnebje.tab" ], { "name": "podnebje", "description": "Zbirka podatkov o mestih, njihovih podnebnih lastnosti in pripadajo\u010de podnebje. Zbirka vsebuje 11 spremenljivk, ki opisujejo podnebne lastnosti mest: rekordno visoka temperatura [\u00b0C], povpre\u010dno visoka temperatura [\u00b0C], dnevno temperaturno povpre\u010dje [\u00b0C], povpre\u010dno nizka temperatura [\u00b0C], rekordno nizka temperatura [\u00b0C], povpre\u010dne padavine [mm], \u0161tevilo de\u017eevnih dni, \u0161tevilo sne\u017eenih dni, povpre\u010dna relativna vlaga (%), povpre\u010dne mese\u010dne ure sonca, povpre\u010dni ultravijoli\u010dni indeks. Za vsako mesto je nato zapisano, v katero podnebje ga uvr\u0161\u010damo. 
Vsako mesto ima zapisano tudi svojo zemljepisno dol\u017eino in \u0161irino.", "collection": "UCI", "version": "1.1", "year": 2015, "instances": 41, "missing": 1, "variables": 16, "source": "", "url": "https://datasets.biolab.si/podnebje.tab", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "none", "location": "podnebje.tab", "size": 4578, "publication_status": 0, "tag": "geografija", "tags": [ "geografija" ], "title": "Podnebje evropskih mest" } ], [ [ "core", "poker-hand.tab" ], { "name": "poker-hand", "description": "Each data instance is an example of a Poker hand consisting of five playing cards drawn from a standard deck of 52. Each card is described using two attributes (suit and rank), for a total of 10 predictive attributes. A suit is a discrete attribute (from 1 to 4) representing Hearts, Spades, Diamonds, and Clubs. Rank of the card is an integer-valued attribute with values from 1 to 13 representing Ace, 2, 3, ... , Queen, and King. The discrete class is an integer-encoded poker hand, representing nothing at hand (0), one pair (1), two pairs (2), ..., straight flush (8) and royal flush (9). UCI ML repository contains two data sets (train and test) that we merged into one, with meta attribute reporting on the file origin of each data instance.", "collection": "UCI", "version": "1.0", "year": 2007, "instances": 1025010, "missing": 0, "variables": 12, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/poker-hand.tab", "domain": null, "language": "English", "target": "categorical", "location": "poker-hand.tab", "size": 30302187, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic" ], "title": "Poker Hand" } ], [ [ "core", "political_regimes.tab" ], { "name": "political_regimes", "description": "Classification of political regimes as democracy and dictatorship for 202 countries from 1946 or year of independence to 2008. 
Classification of democracies as parliamentary, semi-presidential (mixed) and presidential. Classification of dictatorships as military, civilian and royal. Includes 1808 instances with 12 features.", "collection": "", "version": "1.2", "year": 2010, "instances": 1808, "missing": 0, "variables": 11, "source": "lifelines", "url": "https://datasets.biolab.si/political_regimes.tab", "domain": null, "language": "English", "target": "none", "location": "political_regimes.tab", "size": 185318, "publication_status": 0, "tag": "survival analysis", "tags": [ "survival analysis", "censoring" ], "title": "Democracy-Dictatorship: Classification of political regimes as democracy and dictatorship", "references": [ "Cheibub, Jos\u00e9 Antonio, Jennifer Gandhi, and James Raymond Vreeland. 2010. 'Democracy and Dictatorship Revisited.' Public Choice, vol. 143, no. 2-1, pp. 67-101." ] } ], [ [ "core", "priimki-slovenija.tab" ], { "name": "priimki-slovenija", "description": "Anonimiziran (za\u0161umljen) vzorec podatkov o lokacijah bivali\u0161\u010d oseb z 200 najpogostej\u0161imi slovenskimi priimki.", "collection": "", "version": "1.2", "year": 2023, "instances": 103530, "missing": 0, "variables": 4, "source": "", "url": "https://datasets.biolab.si/priimki-slovenija.tab", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "none", "location": "priimki-slovenija.tab", "size": 5733576, "publication_status": 0, "tag": "geo", "tags": [ "geo" ], "title": "Priimki po Sloveniji" } ], [ [ "core", "priimki-v-obcinah.tab" ], { "name": "priimki-v-obcinah", "description": "Vzorec \u0161tevila prebivalcev z 200 najpogostej\u0161imi priimki po slovenskih ob\u010dinah.", "collection": "", "version": "1.6", "year": 2023, "instances": 192, "missing": 0, "variables": 208, "source": "", "url": "https://datasets.biolab.si/priimki-v-obcinah.tab", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "none", "location": "priimki-v-obcinah.tab", "size": 175523, 
"publication_status": 0, "tag": "geo", "tags": [ "geo" ], "title": "Pogostosti priimkov po ob\u010dinah" } ], [ [ "core", "primary-tumor.tab" ], { "name": "primary-tumor", "description": "This is a dataset about primary tumors in people. Locations of primary tumors are locations in body where the tumor first appeared and from there started to metastasize to other parts of the body. This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature.", "collection": "UCI", "version": "1.0", "year": 1988, "instances": 339, "missing": 1, "variables": 18, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/primary-tumor.tab", "domain": null, "language": "English", "target": "categorical", "location": "primary-tumor.tab", "size": 24401, "publication_status": 0, "tag": "medical", "tags": [ "medical" ], "title": "Primary Tumor", "references": [ "Cestnik, G., Konenenko, I., & Bratko, I. (1987). Assistant-86: A Knowledge-Elicitation Tool for Sophisticated Users. In I. Bratko & N. Lavrac (Eds.) Progress in Machine Learning, 31-45, Sigma Press." ] } ], [ [ "core", "promoters.tab" ], { "name": "promoters", "description": "E. Coli promoter gene sequences (DNA) with partial domain theory.", "collection": "UCI", "version": "1.0", "year": 1990, "instances": 106, "missing": 0, "variables": 58, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/promoters.tab", "domain": null, "language": "English", "target": "categorical", "location": "promoters.tab", "size": 13216, "publication_status": 0, "tag": "molecular biology", "tags": [ "molecular biology" ], "title": "Promoter Gene Sequences", "references": [ "Harley, C. and Reynolds, R. 1987. Analysis of E. Coli Promoter Sequences. Nucleic Acids Research, 15:2343-2361." 
] } ], [ [ "core", "retail-basket.tab" ], { "name": "retail-basket", "description": "This is a transnational data set which contains all the transactions occurring between 2010-12-1 and 2011-12-09 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.", "collection": "UCI", "version": "1.0", "year": 2012, "instances": 914, "missing": 1, "variables": 50, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/retail-basket.tab", "domain": null, "language": "English", "target": "none", "location": "retail-basket.tab", "size": 92519, "publication_status": 0, "tag": "economy", "tags": [ "economy", "timeseries", "geo", "sparse" ], "title": "Online Retail", "references": [ "Daqing Chen, Sai Liang Sain, and Kun Guo (2012) Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining. Journal of Database Marketing and Customer Strategy Management 19(3), pp. 197." ] } ], [ [ "core", "sa-edu-TCGA-BRCA.pkl" ], { "name": "sa-edu-TCGA-BRCA", "description": "Survival dataset from The Cancer Genome Atlas Program (TCGA) consisting of 1119 samples from patients with breast cancer. Besides the survival time and event features it contains expression data for 22826 genes. 
Gene expression counts are stored as transcripts per million (TPM).", "collection": "", "version": "1.0", "year": 2015, "instances": 1119, "missing": 0, "variables": 22829, "source": "", "url": "https://datasets.biolab.si/sa-edu-TCGA-BRCA.pkl", "domain": null, "language": "English", "target": "none", "location": "sa-edu-TCGA-BRCA.pkl", "size": 206502504, "publication_status": 0, "tag": "survival analysis", "tags": [ "survival analysis", "gene expression", "censoring" ], "title": "SA: TCGA-BRCA" } ], [ [ "core", "sa-edu-TCGA-CESC.tab" ], { "name": "sa-edu-TCGA-CESC", "description": "Survival dataset from The Cancer Genome Atlas Program (TCGA) consisting of 306 samples from patients with cervical cancer. Besides the survival time and event features it contains expression data for 23368 genes. Gene expression counts are stored as transcripts per million (TPM).", "collection": "", "version": "1.0", "year": 2015, "instances": 306, "missing": 0, "variables": 23371, "source": "", "url": "https://datasets.biolab.si/sa-edu-TCGA-CESC.tab", "domain": null, "language": "English", "target": "none", "location": "sa-edu-TCGA-CESC.tab", "size": 108018572, "publication_status": 0, "tag": "survival analysis", "tags": [ "survival analysis", "gene expression", "censoring" ], "title": "SA: TCGA-CESC" } ], [ [ "core", "sa-edu-dental-fillings.tab" ], { "name": "sa-edu-dental-fillings", "description": "Toy survival dataset containing data from 20 people regarding how long their dental fillings stayed in place over an observation window of 10 years. 
It includes five features: Time, Event (dental filling falls out = 1, censoring = 0), Type of material (composite, ceramic), Brushing time (time spent on dental hygiene daily) and Cat or dog person (cat, dog).", "collection": "", "version": "1.5", "year": 2023, "instances": 20, "missing": 0, "variables": 6, "source": "", "url": "https://datasets.biolab.si/sa-edu-dental-fillings.tab", "domain": null, "language": "English", "target": "none", "location": "sa-edu-dental-fillings.tab", "size": 754, "publication_status": 0, "tag": "survival analysis", "tags": [ "survival analysis", "censoring" ], "title": "SA: Dental Fillings Dataset" } ], [ [ "core", "sailing.tab" ], { "name": "sailing", "description": "Hand-crafted data set to explain inference of classification trees. It records data for Sara, a weekend sailor, and the data before sailing on company, boat type and weather. The class tells if she actually went sailing.", "collection": "", "version": "1.0", "year": 2003, "instances": 20, "missing": 0, "variables": 4, "source": "", "url": "https://datasets.biolab.si/sailing.tab", "domain": null, "language": "English", "target": "categorical", "location": "sailing.tab", "size": 455, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic" ], "title": "Sailing" } ], [ [ "core", "single-cell-aml-1k.pkl.gz" ], { "name": "single-cell-aml-1k.pkl", "description": "Gene expressions in bone marrow mononuclear cells from a patient with acute myeloid leukemia (AML) and two healthy donors used as controls. The data includes a sample of 1000 cells and 1000 genes with the highest dispersion. 
This is a sample data that comes with Loupe Cell Browser, and includes cells from three separate experiments with data sets published on 10x Genomics single-cell data sets page: AML027 Pre-transplant BMMCs, Frozen BMMCs (Healthy Control 1), and Frozen BMMCs (Healthy Control 2).", "collection": "10x Genomics", "version": "1.0", "year": 2017, "instances": 1000, "missing": 0, "variables": 1004, "source": "10x Genomics", "url": "https://datasets.biolab.si/single-cell-aml-1k.pkl.gz", "domain": null, "language": "English", "target": "categorical", "location": "single-cell-aml-1k.pkl.gz", "size": 341970, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Bone marrow mononuclear cells with AML (sample)", "references": [ "Zheng, G. X., Terry, J. M., ... Gregory, M. T. (2017). Massively parallel digital transcriptional profiling of single cells. Nature communications, 8, 14049." ] } ], [ [ "core", "slovenia-traffic-accidents-2016-events.tab" ], { "name": "slovenia-traffic-accidents-2016-events", "description": "Traffic accidents in Slovenia in year 2016 as published by the Ministry of Internal Affairs. Events (rows) are described through location, cause and type of accident, and road condition. The data on geographic location is provided.", "collection": "", "version": "1.0", "year": 2016, "instances": 17931, "missing": 1, "variables": 21, "source": "Archive of Slovene Ministry of Interior", "url": "https://datasets.biolab.si/slovenia-traffic-accidents-2016-events.tab", "domain": null, "language": "English", "target": "none", "location": "slovenia-traffic-accidents-2016-events.tab", "size": 4501650, "publication_status": 0, "tag": "geo", "tags": [ "geo", "timeseries" ], "title": "Traffic accidents - events" } ], [ [ "core", "slovenia-traffic-accidents-2016-persons.tab" ], { "name": "slovenia-traffic-accidents-2016-persons", "description": "Persons involved in traffic accidents in Slovenia in year 2016 as published by the Ministry of Internal Affairs. 
The data includes geographic location of the accident, and profile of a person involved containing the age, gender, type of accident, the result of an alcohol test, and an indicator if the person caused the accident.", "collection": "", "version": "1.0", "year": 2016, "instances": 32857, "missing": 1, "variables": 13, "source": "Archive of Slovene Ministry of Interior", "url": "https://datasets.biolab.si/slovenia-traffic-accidents-2016-persons.tab", "domain": null, "language": "English", "target": "none", "location": "slovenia-traffic-accidents-2016-persons.tab", "size": 2915308, "publication_status": 0, "tag": "geo", "tags": [ "geo", "timeseries" ], "title": "Traffic accidents - persons" } ], [ [ "core", "slovenian-national-assembly-eng.tab" ], { "name": "slovenian-national-assembly-eng", "description": "Ballot counts and Member of parliament (MP) description data set. Ballot counts were retrieved for selected May and June 2017 parliament sessions. ", "collection": "Parlameter", "version": "1.0", "year": 2017, "instances": 84, "missing": 1, "variables": 32, "source": "Parlameter API", "url": "https://datasets.biolab.si/slovenian-national-assembly-eng.tab", "domain": null, "language": "English", "target": "none", "location": "slovenian-national-assembly-eng.tab", "size": 18190, "publication_status": 0, "tag": "image analytics", "tags": [ "image analytics", "politics" ], "title": "Slovenian National Assembly" } ], [ [ "core", "slovenska-naselja.tab" ], { "name": "slovenska-naselja", "description": "Zemljepisne koordinate, ob\u010dina, pokrajina, povr\u0161ina, \u0161tevilo prebivalcev in nadmorska vi\u0161ina za (skoraj) vsa naselja v Sloveniji", "collection": "", "version": "1.1", "year": 2023, "instances": 6002, "missing": 1, "variables": 8, "source": "https://sl.wikipedia.org/wiki/Seznam_naselij_v_Sloveniji", "url": "https://datasets.biolab.si/slovenska-naselja.tab", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "none", "location": 
"slovenska-naselja.tab", "size": 492354, "publication_status": 0, "tag": "geo", "tags": [ "geo" ], "title": "Slovenska naselja" } ], [ [ "core", "telecom-customer-churn.xlsx" ], { "name": "telecom-customer-churn", "description": "The dataset provides customer data to predict churn and develop retention strategies. Each row represents a customer, and columns detail attributes such as demographics (gender, age range, presence of partners or dependents), services subscribed (phone, internet, tech support, streaming, etc.), and account information (tenure, contract type, payment method, monthly and total charges). The target variable, \"Churn,\" indicates whether the customer left within the last month, making the dataset ideal for analyzing behavior patterns and crafting focused retention programs.", "collection": "UCI", "version": "1.2", "year": 2018, "instances": 7043, "missing": 1, "variables": 21, "source": "Telco Customer Churn", "url": "https://datasets.biolab.si/telecom-customer-churn.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "telecom-customer-churn.xlsx", "size": 722015, "publication_status": 0, "tag": "churn", "tags": [ "churn", "telecom" ], "title": "Telecom Customer Churn" } ], [ [ "core", "titanic.tab" ], { "name": "titanic", "description": "This data set provides information on the fate of passengers on the fatal maiden voyage of the ocean liner Titanic, summarized according to economic status (class), sex, age and survival.", "collection": "R", "version": "1.0", "year": null, "instances": 2201, "missing": 0, "variables": 4, "source": "", "url": "https://datasets.biolab.si/titanic.tab", "domain": null, "language": "English", "target": "categorical", "location": "titanic.tab", "size": 45112, "publication_status": 0, "tag": null, "tags": [], "title": "Titanic", "references": [ "Dawson Robert J. MacG. (1995) The \u2018Unusual Episode\u2019 Data Revisited. Journal of Statistics Education 3(3)." 
] } ], [ [ "core", "traffic-signs.tab" ], { "name": "traffic-signs", "description": "Images (icons) of traffic signs.", "collection": "", "version": "1.0", "year": 2017, "instances": 70, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/traffic-signs.tab", "domain": null, "language": "English", "target": "categorical", "location": "traffic-signs.tab", "size": 3726, "publication_status": 0, "tag": "image analytics", "tags": [ "image analytics" ], "title": "Traffic signs" } ], [ [ "core", "vehicle.tab" ], { "name": "vehicle", "description": "The purpose is to classify a given silhouette as one of four types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.", "collection": "UCI", "version": "1.0", "year": 1986, "instances": 846, "missing": 0, "variables": 19, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/vehicle.tab", "domain": null, "language": "English", "target": "categorical", "location": "vehicle.tab", "size": 55560, "publication_status": 0, "tag": null, "tags": [], "title": "Vehicle Silhouettes", "references": [ "Siebert, JP. (1987) Vehicle Recognition Using Rule Based Methods, Turing Institute Research Memorandum TIRM-87-018." ] } ], [ [ "core", "veteran.tab" ], { "name": "veteran", "description": "Randomized trial of two treatment regimens for lung cancer. This is a standard survival analysis data set. 
Includes 137 instances with 8 features (Survival Time, Survival Event, Cell Type, Time from Diagnosis, Treatment, Age, Karnofsky performance score, Prior Therapy).", "collection": "", "version": "1.2", "year": 1980, "instances": 137, "missing": 0, "variables": 8, "source": "A package for survival analysis in R", "url": "https://datasets.biolab.si/veteran.tab", "domain": null, "language": "English", "target": "none", "location": "veteran.tab", "size": 5490, "publication_status": 0, "tag": "survival analysis", "tags": [ "survival analysis", "censoring" ], "title": "Veterans Administration Lung Cancer Study", "references": [ "D Kalbfleisch and RL Prentice (1980), The Statistical Analysis of Failure Time Data. Wiley, New York." ] } ], [ [ "core", "voting.tab" ], { "name": "voting", "description": "This data set includes votes for each of the U.S. House of Representatives Congressmen on the 16 key votes identified by the CQA. The CQA lists nine different types of votes: voted for, paired for, and announced for (these three simplified to yea), voted against, paired against, and announced against (these three simplified to nay), voted present, voted present to avoid conflict of interest, and did not vote or otherwise make a position known (these three simplified to an unknown disposition).", "collection": "UCI", "version": "1.0", "year": 1987, "instances": 435, "missing": 1, "variables": 17, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/voting.tab", "domain": null, "language": "English", "target": "categorical", "location": "voting.tab", "size": 18212, "publication_status": 0, "tag": "politics", "tags": [ "politics" ], "title": "Congressional Voting Records", "references": [ "Schlimmer, J. C. (1987). Concept acquisition through representational adjustment. Doctoral dissertation, Department of Information and Computer Science, University of California, Irvine, CA." 
] } ], [ [ "core", "wine.tab" ], { "name": "wine", "description": "This is the data on wines grown in the same region in Italy but derived from three different cultivars. Wines are profiled by chemical analysis that reports on the quantities of thirteen constituents, including alcohol, malic acid, and flavanoids.", "collection": "UCI", "version": "1.0", "year": 1992, "instances": 178, "missing": 0, "variables": 14, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/wine.tab", "domain": null, "language": "English", "target": "categorical", "location": "wine.tab", "size": 10991, "publication_status": 0, "tag": "chemistry", "tags": [ "chemistry" ], "title": "Wine" } ], [ [ "core", "winequality-red.tab" ], { "name": "winequality-red", "description": "The quality of the red variants of the Portuguese \"Vinho Verde\" wine. Wine samples are described with results of physicochemical tests that include information on acidity, sugar and alcohol content, density and other. This data set is related to its sister data set on the quality of white wines.", "collection": "UCI", "version": "1.0", "year": 2009, "instances": 1599, "missing": 0, "variables": 12, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/winequality-red.tab", "domain": null, "language": "English", "target": "numeric", "location": "winequality-red.tab", "size": 84183, "publication_status": 0, "tag": null, "tags": [], "title": "Wine quality - red", "references": [ "Cortez P, Cerdeira A, Almeida F, Matos T, and Reis J (2009) Modeling wine preferences by data mining from physicochemical properties, Decision Support Systems Elsevier 47(4):547-553." ] } ], [ [ "core", "winequality-white.tab" ], { "name": "winequality-white", "description": "The quality of the white variants of the Portuguese \"Vinho Verde\" wine. Wine samples are described with results of physicochemical tests that include information on acidity, sugar and alcohol content, density and other. 
This data set is related to its sister data set on the quality of red wines.", "collection": "UCI", "version": "1.0", "year": 2009, "instances": 4898, "missing": 0, "variables": 12, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/winequality-white.tab", "domain": null, "language": "English", "target": "numeric", "location": "winequality-white.tab", "size": 264270, "publication_status": 0, "tag": null, "tags": [], "title": "Wine quality - white", "references": [ "Cortez P, Cerdeira A, Almeida F, Matos T, and Reis J (2009) Modeling wine preferences by data mining from physicochemical properties, Decision Support Systems Elsevier 47(4):547-553." ] } ], [ [ "core", "words-food.xlsx" ], { "name": "words-food", "description": "English words for fruit, vegetables and dairy products.", "collection": "", "version": "1.1", "year": 2024, "instances": 108, "missing": 0, "variables": 2, "source": "", "url": "https://datasets.biolab.si/words-food.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "words-food.xlsx", "size": 10293, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic", "text" ], "title": "Food Words" } ], [ [ "core", "words.xlsx" ], { "name": "words", "description": "ChatGPT-generated dataset of random English words, in the alphabetical order.", "collection": "", "version": "1.2", "year": 2023, "instances": 150, "missing": 0, "variables": 1, "source": "", "url": "https://datasets.biolab.si/words.xlsx", "domain": null, "language": "English", "target": "none", "location": "words.xlsx", "size": 10854, "publication_status": 0, "tag": "synthetic", "tags": [ "synthetic", "text" ], "title": "English Words" } ], [ [ "core", "yplp.xlsx" ], { "name": "yplp", "description": "Yeast protein localization images from YPL+ database. Included are only images with unique (single) localization site. 
The data set includes only localization sites that were reported for at least 30 distinct proteins.", "collection": "YPL+.db", "version": "1.0", "year": 2018, "instances": 2569, "missing": 0, "variables": 4, "source": "YPL+.db", "url": "https://datasets.biolab.si/yplp.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "yplp.xlsx", "size": 93607, "publication_status": 0, "tag": "image analytics", "tags": [ "image analytics" ], "title": "Yeast Protein Localization" } ], [ [ "core", "zivali-v-finscini.tab" ], { "name": "zivali-v-finscini", "description": "Zbirka podatkov o \u017eivalih, njihovih lastnosti in pripadajo\u010di vrsti - v fin\u0161\u010dini. Uporabljamo jo lahko, da predstavimo, kako je videti analiza podatkov 'na slepo', kot jo mora izvajati ra\u010dunalnik, na primer pri sestavljanju klasifikacijskih dreves.", "collection": "UCI", "version": "1.3", "year": 1990, "instances": 100, "missing": 1, "variables": 18, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/zivali-v-finscini.tab", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "categorical", "location": "zivali-v-finscini.tab", "size": 9415, "publication_status": 0, "tag": "biologija", "tags": [ "biologija" ], "title": "\u017divali v fin\u0161\u010dini" } ], [ [ "core", "zivali-z-neznanimi.tab" ], { "name": "zivali-z-neznanimi", "description": "Zbirka podatkov o \u017eivalih, njihovih lastnosti in pripadajo\u010di vrsti. Zbirka vsebuje 17 spremenljivk, ki opisujejo lastnosti \u017eivali: ali ima dlako, perje, vali jajca, daje mleko, leti, je vodna, je plenilec, ima zobe, hrbtenico, diha, je strupena, ima plavuti, noge, rep, je doma\u010da, majhna. Za vsako \u017eival pa je nato zapisano v katero vrsto spada. 
Za 13 \u017eivali vrsta ni podana.", "collection": "UCI", "version": "1.1", "year": 1990, "instances": 100, "missing": 1, "variables": 18, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/zivali-z-neznanimi.tab", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "categorical", "location": "zivali-z-neznanimi.tab", "size": 6531, "publication_status": 0, "tag": "biologija", "tags": [ "biologija" ], "title": "\u017divali - z neznanimi tipi" } ], [ [ "core", "zivali.tab" ], { "name": "zivali", "description": "Zbirka podatkov o \u017eivalih, njihovih lastnosti in pripadajo\u010di vrsti. Zbirka vsebuje 17 spremenljivk, ki opisujejo lastnosti \u017eivali: ali ima dlako, perje, vali jajca, daje mleko, leti, je vodna, je plenilec, ima zobe, hrbtenico, diha, je strupena, ima plavuti, noge, rep, je doma\u010da, majhna. Za vsako \u017eival pa je nato zapisano v katero vrsto spada.", "collection": "UCI", "version": "1.2", "year": 1990, "instances": 100, "missing": 0, "variables": 18, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/zivali.tab", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "categorical", "location": "zivali.tab", "size": 6713, "publication_status": 0, "tag": "biologija", "tags": [ "biologija" ], "title": "\u017divali" } ], [ [ "core", "zoo-finnish.tab" ], { "name": "zoo-finnish", "description": "A dataset created by Richard Forsyth contains seventeen binary-valued attributes that describe animals. Features include information about presence of hair, feathers and teeth, report if animal is aquatic or airborne, and alike. 
Animals are named and are classified into seven categories: amphibian, bird, fish, insect, invertebrate, mammal, and reptile.", "collection": "UCI", "version": "1.1", "year": 1990, "instances": 100, "missing": 1, "variables": 18, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/zoo-finnish.tab", "domain": "Education", "language": "English", "target": "categorical", "location": "zoo-finnish.tab", "size": 9415, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Zoo data set - in Finnish" } ], [ [ "core", "zoo-missing.tab" ], { "name": "zoo-missing", "description": "A dataset created by Richard Forsyth contains seventeen binary-valued attributes that describe animals. Features include information about presence of hair, feathers and teeth, report if animal is aquatic or airborne, and the like. Animals are named and are classified into seven categories: amphibian, bird, fish, insect, invertebrate, mammal, and reptile. This version includes some missing data.", "collection": "UCI", "version": "1.1", "year": 1990, "instances": 100, "missing": 1, "variables": 18, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/zoo-missing.tab", "domain": "Education", "language": "English", "target": "categorical", "location": "zoo-missing.tab", "size": 7102, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Zoo - with missing data" } ], [ [ "core", "zoo.tab" ], { "name": "zoo", "description": "A dataset created by Richard Forsyth contains seventeen binary-valued attributes that describe animals. Features include information about presence of hair, feathers and teeth, report if animal is aquatic or airborne, and the like.
Animals are named and are classified into seven categories: amphibian, bird, fish, insect, invertebrate, mammal, and reptile.", "collection": "UCI", "version": "1.4", "year": 1990, "instances": 100, "missing": 0, "variables": 18, "source": "UCI ML Repository", "url": "https://datasets.biolab.si/zoo.tab", "domain": null, "language": "English", "target": "categorical", "location": "zoo.tab", "size": 7038, "publication_status": 0, "tag": "biology", "tags": [ "biology" ], "title": "Zoo" } ], [ [ "core", "tibetan-dogs.xlsx" ], { "name": "tibetan-dogs", "description": "Seventy-two photos of dogs from Tibetan breeds, including the Tibetan Spaniel, Terrier, Mastiff, Lhasa Apso, and Shih Tzu. The dataset also includes images of Pekingese, as this breed is often confused with the Tibetan Spaniel. This fun dataset is ideal for classification, clustering, and showcasing breed prediction for random dog photos from the internet.", "collection": "", "version": "1.0", "year": 2024, "instances": 72, "missing": 0, "variables": 6, "source": "", "url": "https://datasets.biolab.si/tibetan-dogs.xlsx", "domain": null, "language": "English", "target": "categorical", "location": "tibetan-dogs.xlsx", "size": 11645, "publication_status": 0, "tag": "image analytics", "tags": [ "image analytics" ], "title": "Tibetan Dogs" } ], [ [ "core", "luxembourg-leaves.xlsx" ], { "name": "luxembourg-leaves", "description": "Collection of leaves from a park in Luxembourg, unclassified", "collection": "", "version": "1.5", "year": 2024, "instances": 79, "missing": 0, "variables": 2, "source": "", "url": "https://datasets.biolab.si/luxembourg-leaves.xlsx", "domain": "Education", "language": "English", "target": "none", "location": "luxembourg-leaves.xlsx", "size": 9941, "publication_status": 1, "tag": "image analytics", "tags": [ "image analytics" ], "title": "Luxembourg autumn leaves (without classes)" } ], [ [ "core", "luxembourg-leaves-classified.xlsx" ], { "name": "luxembourg-leaves-classified",
"description": "Photos of leaves picked in a park in the Kirchberg quarter in Luxembourg City, autumn 2024.", "collection": "", "version": "1.3", "year": 2024, "instances": 79, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/luxembourg-leaves-classified.xlsx", "domain": "Education", "language": "English", "target": "categorical", "location": "luxembourg-leaves-classified.xlsx", "size": 10402, "publication_status": 0, "tag": "image analytics", "tags": [ "image analytics" ], "title": "Luxembourg autumn leaves" } ], [ [ "core", "luxembourg-leaves-train.tab" ], { "name": "luxembourg-leaves-train", "description": "Leaves picked up in the Luxembourg park, close to the center of the city, in autumn 2024, classified; training set", "collection": "", "version": "1.2", "year": 2024, "instances": 40, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/luxembourg-leaves-train.tab", "domain": "Education", "language": "English", "target": "categorical", "location": "luxembourg-leaves-train.tab", "size": 1671, "publication_status": 1, "tag": "image analytics", "tags": [ "image analytics" ], "title": "Luxembourg autumn leaves, classified - training set" } ], [ [ "core", "luxembourg-leaves-test.tab" ], { "name": "luxembourg-leaves-test", "description": "Leaves picked up in the Luxembourg park, close to the center of the city, classified; testing set", "collection": "", "version": "1.3", "year": 2024, "instances": 39, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/luxembourg-leaves-test.tab", "domain": "Education", "language": "English", "target": "categorical", "location": "luxembourg-leaves-test.tab", "size": 1625, "publication_status": 1, "tag": "image analytics", "tags": [ "image analytics" ], "title": "Luxembourg autumn leaves, classified - testing set" } ], [ [ "core", "UL Rector Elections 2009.xlsx" ], { "name": "UL Rector Elections 2009", "description": "The rector of the University of 
Ljubljana has a four-year mandate. Before introducing electronic voting, voting was organized at member faculties, which made it possible to analyze data per faculty. This data set contains the data for the year 2009, with four candidates of different profiles, which makes it interesting for cluster analysis.\n\nThe data contains English names and Slovenian acronyms of the faculties, the total number of votes cast and proportions of votes for each candidate.", "collection": "", "version": "1.0", "year": 2024, "instances": 26, "missing": 0, "variables": 7, "source": "", "url": "https://datasets.biolab.si/UL Rector Elections 2009.xlsx", "domain": "Education", "language": "English", "target": "none", "location": "UL Rector Elections 2009.xlsx", "size": 14405, "publication_status": 0, "tag": "education", "tags": [ "education" ], "title": "Elections for Rector of the University of Ljubljana (2009)" } ], [ [ "core", "slovenian-surnames.tab" ], { "name": "slovenian-surnames", "description": "Approximate locations of a sample of Slovenian households with the 200 most common Slovenian surnames. Data was collected in the late 1990s and is anonymized by randomly changing the location by up to 10 km.", "collection": "", "version": "1.0", "year": 2024, "instances": 103530, "missing": 0, "variables": 4, "source": "", "url": "https://datasets.biolab.si/slovenian-surnames.tab", "domain": "Education", "language": "English", "target": "none", "location": "slovenian-surnames.tab", "size": 5733584, "publication_status": 0, "tag": "geo", "tags": [ "geo" ], "title": "Slovenian Surnames" } ], [ [ "core", "slovenian surnames by municipality.tab" ], { "name": "slovenian surnames by municipality", "description": "Distribution of households with the 200 most common Slovenian surnames in municipalities.
Data was collected in the late 1990s and is anonymized by randomly changing the location by up to 10 km.\n\nThe data is normalized by the number of households with the 200 most common surnames, not the total population. I.e., the data in each row sums to 100 percent.", "collection": "", "version": "1.0", "year": 2024, "instances": 192, "missing": 0, "variables": 208, "source": "", "url": "https://datasets.biolab.si/slovenian surnames by municipality.tab", "domain": "Education", "language": "English", "target": "none", "location": "slovenian surnames by municipality.tab", "size": 272408, "publication_status": 0, "tag": "geo", "tags": [ "geo" ], "title": "Slovenian Surnames by Municipality" } ], [ [ "core", "diabetes.xlsx" ], { "name": "diabetes", "description": "The Diabetes dataset contains clinical and biochemical data collected from diabetes patients to predict disease progression. It includes ten features: age, sex, body mass index (BMI), average blood pressure, and six blood serum measurements such as cholesterol and glucose levels, all standardized to zero mean and unit variance. The target variable represents a quantitative measure of diabetes progression one year after baseline, with higher values indicating a more severe disease progression. This dataset provides a snapshot of the relationships between various health indicators and the progression of diabetes.", "collection": "", "version": "1.1", "year": 2024, "instances": 442, "missing": 0, "variables": 11, "source": "", "url": "https://datasets.biolab.si/diabetes.xlsx", "domain": null, "language": "English", "target": "numeric", "location": "diabetes.xlsx", "size": 58012, "publication_status": 0, "tag": "medicine", "tags": [ "medicine" ], "title": "Diabetes", "references": [ "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) Least Angle Regression, Annals of Statistics 32(2): 407-499."
] } ], [ [ "core", "california_housing.xlsx" ], { "name": "california_housing", "description": "The California Housing Dataset contains information about California districts collected from the 1990 U.S. Census, aimed at predicting median house values. It includes 8 features: median income, housing median age, total rooms, total bedrooms, population, households, latitude, and longitude. The target variable represents the median house value for each district (in hundreds of thousands of dollars), with higher values indicating more expensive housing. The dataset provides a snapshot of the relationships between socio-economic and geographic factors and housing prices in California.", "collection": "", "version": "1.1", "year": 2024, "instances": 20640, "missing": 0, "variables": 9, "source": "", "url": "https://datasets.biolab.si/california_housing.xlsx", "domain": null, "language": "English", "target": "numeric", "location": "california_housing.xlsx", "size": 1693951, "publication_status": 0, "tag": "economy", "tags": [ "economy" ], "title": "California Housing", "references": [ "Pace, R. Kelley and Ronald Barry (1997) Sparse Spatial Autoregressions, Statistics and Probability Letters: 33(3):291-297." ] } ], [ [ "core", "advertising.xlsx" ], { "name": "advertising", "description": "The Advertising dataset contains information on advertising budgets for three media channels\u2014TV, Radio, and Newspaper\u2014and their corresponding Sales figures. The advertising budgets for TV, Radio, and Newspaper are measured in thousands of dollars, while Sales represent the number of products sold, measured in thousands of units. 
The dataset captures the relationship between advertising investments in these channels and the resulting product sales, providing a straightforward view of how different forms of media contribute to marketing outcomes.", "collection": "", "version": "1.1", "year": 2024, "instances": 200, "missing": 0, "variables": 4, "source": "", "url": "https://datasets.biolab.si/advertising.xlsx", "domain": null, "language": "English", "target": "numeric", "location": "advertising.xlsx", "size": 15920, "publication_status": 0, "tag": "economy", "tags": [ "economy" ], "title": "Advertising", "references": [ "James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An Introduction to Statistical Learning: With Applications in R. Springer." ] } ], [ [ "core", "naughty-nice.tab" ], { "name": "naughty-nice", "description": "A synthetic dataset of 30 children whom the Santa scores on six variables to determine whether the children were naughty or nice. It is intended as an educational data set for logistic regression.", "collection": "", "version": "1.0", "year": 2024, "instances": 30, "missing": 0, "variables": 8, "source": "", "url": "https://datasets.biolab.si/naughty-nice.tab", "domain": null, "language": "English", "target": "categorical", "location": "naughty-nice.tab", "size": 971, "publication_status": 0, "tag": "classification", "tags": [ "classification", "educational", "synthetic" ], "title": "Naughty or Nice" } ], [ [ "core", "palc\u030cki.xlsx" ], { "name": "palc\u030cki", "description": "Lastnosti pal\u010dkov; podatki za u\u010dno uro v okviru projekta Pumice", "collection": "", "version": "1.3", "year": 2025, "instances": 21, "missing": 1, "variables": 10, "source": "", "url": "https://datasets.biolab.si/palc\u030cki.xlsx", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "categorical", "location": "palc\u030cki.xlsx", "size": 10036, "publication_status": 0, "tag": "pumice", "tags": [ "pumice" ], "title": "Pal\u010dki", "references": [ 
"https://pumice.si/aktivnosti/palcki" ] } ], [ [ "core", "gnomes.xlsx" ], { "name": "gnomes", "description": "Gnomes; data for school activity", "collection": "", "version": "1.0", "year": 2025, "instances": 21, "missing": 1, "variables": 10, "source": "", "url": "https://datasets.biolab.si/gnomes.xlsx", "domain": "Education", "language": "English", "target": "categorical", "location": "gnomes.xlsx", "size": 10003, "publication_status": 0, "tag": null, "tags": [], "title": "Gnomes" } ], [ [ "core", "GDS4168.tab" ], { "name": "GDS4168", "description": "This dataset (GDS4168) contains gene expression data from the blood B cells of 41 patients with chronic lymphocytic leukemia (CLL) and 11 healthy individuals. CLL is a common type of blood cancer that mainly affects adults. The data can help researchers understand the molecular differences between healthy and cancerous B cells, providing insights into how CLL develops and progresses. The dataset was taken from the Gene Expression Omnibus and is available at NCBI GEO.", "collection": "Gene Expression Omnibus", "version": "1.0", "year": 2010, "instances": 52, "missing": 0, "variables": 16468, "source": "", "url": "https://datasets.biolab.si/GDS4168.tab", "domain": null, "language": "English", "target": "categorical", "location": "GDS4168.tab", "size": 5547285, "publication_status": 0, "tag": "gene expression", "tags": [ "gene expression", "medicine" ], "title": "Chronic Lymphocytic Leukemia Peripheral Blood B-Cell Expression Profiles (GDS4168)", "references": [ "Gutierrez A Jr, Tschumper RC, Wu X, Shanafelt TD et al. (2010) LEF-1 is a prosurvival factor in chronic lymphocytic leukemia and is expressed in the preleukemic state of monoclonal B-cell lymphocytosis. Blood 116(16):2975-83." 
] } ], [ [ "core", "" ], { "name": "", "description": "", "collection": "", "version": "", "year": 0, "instances": 0, "missing": 0, "variables": 0, "source": "", "url": "https://datasets.biolab.si/", "domain": "core", "language": "English", "target": "", "location": "", "size": 0, "publication_status": 0, "tag": "gene expression", "tags": [ "gene expression", "medicine" ], "title": "" } ], [ [ "core", "vreme-prestolnice.xlsx" ], { "name": "vreme-prestolnice", "description": "Povpre\u010dna mese\u010dna temperatura in koli\u010dina padavin v evropskih prestolnicah.", "collection": "Pumice", "version": "1.2", "year": 2025, "instances": 40, "missing": 1, "variables": 28, "source": "", "url": "https://datasets.biolab.si/vreme-prestolnice.xlsx", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "none", "location": "vreme-prestolnice.xlsx", "size": 17463, "publication_status": 0, "tag": "geografija", "tags": [ "geografija" ], "title": "Temperatura in padavine v evropskih prestolnicah" } ], [ [ "core", "weather-capitals.xlsx" ], { "name": "weather-capitals", "description": "Average monthly temperatures and precipitation in European capitals", "collection": "Pumice", "version": "2.3", "year": 2025, "instances": 40, "missing": 1, "variables": 28, "source": "", "url": "https://datasets.biolab.si/weather-capitals.xlsx", "domain": "Education", "language": "English", "target": "none", "location": "weather-capitals.xlsx", "size": 17456, "publication_status": 0, "tag": "geography", "tags": [ "geography" ], "title": "Temperature and Precipitation in European Capitals" } ], [ [ "core", "matematika-nogomet.xlsx" ], { "name": "matematika-nogomet", "description": "Izmi\u0161ljeni podatki o spretnosti 11 u\u010dencev v matematiki in nogometu. 
Podatki so uporabni za pou\u010devanje gru\u010denja.", "collection": "Pumice", "version": "1.0", "year": 2025, "instances": 11, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/matematika-nogomet.xlsx", "domain": "Education", "language": "Sloven\u0161\u010dina", "target": "none", "location": "matematika-nogomet.xlsx", "size": 9223, "publication_status": 0, "tag": null, "tags": [], "title": "Matematika in nogomet" } ], [ [ "core", "mathematics-football.xlsx" ], { "name": "mathematics-football", "description": "A data set for a live demonstration of clustering in a classroom", "collection": "Pumice", "version": "1.0", "year": 2025, "instances": 11, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/mathematics-football.xlsx", "domain": "Education", "language": "English", "target": "none", "location": "mathematics-football.xlsx", "size": 9215, "publication_status": 0, "tag": null, "tags": [], "title": "Mathematics and Soccer" } ], [ [ "core", "mathe\u0301matiques-football.xlsx" ], { "name": "mathe\u0301matiques-football", "description": "Donn\u00e9es pour une d\u00e9monstration en direct du regroupement en classe", "collection": "Pumice", "version": "1.0", "year": 2025, "instances": 11, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/mathe\u0301matiques-football.xlsx", "domain": "Education", "language": "Fran\u00e7ais", "target": "none", "location": "mathe\u0301matiques-football.xlsx", "size": 9243, "publication_status": 0, "tag": null, "tags": [], "title": "Math\u00e9matiques et football" } ], [ [ "core", "Mathematik-Fu\u00dfball.xlsx" ], { "name": "Mathematik-Fu\u00dfball", "description": "Ein Datensatz f\u00fcr eine Live-Demonstration des Clusterings in einem Klassenzimmer.", "collection": "Pumice", "version": "1.1", "year": 2025, "instances": 11, "missing": 0, "variables": 3, "source": "", "url": "https://datasets.biolab.si/Mathematik-Fu\u00dfball.xlsx", "domain": 
"Education", "language": "Deutsch", "target": "none", "location": "Mathematik-Fu\u00dfball.xlsx", "size": 9227, "publication_status": 0, "tag": null, "tags": [], "title": "Mathematik und Fu\u00dfball" } ], [ [ "core", "zwerge.xlsx" ], { "name": "zwerge", "description": "Daten f\u00fcr eine Schulaktivit\u00e4t", "collection": "Pumice", "version": "1.0", "year": 2025, "instances": 21, "missing": 1, "variables": 10, "source": "", "url": "https://datasets.biolab.si/zwerge.xlsx", "domain": "Education", "language": "Deutsch", "target": "categorical", "location": "zwerge.xlsx", "size": 10071, "publication_status": 0, "tag": null, "tags": [], "title": "Zwerge" } ], [ [ "core", "nains.xlsx" ], { "name": "nains", "description": "Donn\u00e9es pour une activit\u00e9 scolaire", "collection": "Pumice", "version": "1.0", "year": 2025, "instances": 21, "missing": 1, "variables": 10, "source": "", "url": "https://datasets.biolab.si/nains.xlsx", "domain": "Education", "language": "Fran\u00e7ais", "target": "categorical", "location": "nains.xlsx", "size": 10083, "publication_status": 0, "tag": null, "tags": [], "title": "Nains" } ], [ [ "core", "wetter-hauptsta\u0308dten.xlsx" ], { "name": "wetter-hauptsta\u0308dten", "description": "Durchschnittliche monatliche Temperaturen und Niederschl\u00e4ge in europ\u00e4ischen Hauptst\u00e4dten", "collection": "Pumice", "version": "1.3", "year": 2025, "instances": 40, "missing": 1, "variables": 28, "source": "", "url": "https://datasets.biolab.si/wetter-hauptsta\u0308dten.xlsx", "domain": "Education", "language": "Deutsch", "target": "none", "location": "wetter-hauptsta\u0308dten.xlsx", "size": 17549, "publication_status": 0, "tag": "geography", "tags": [ "geography" ], "title": "Temperaturen und Niederschl\u00e4ge in europ\u00e4ischen Hauptst\u00e4dten" } ], [ [ "core", "me\u0301te\u0301o-capitals.xlsx" ], { "name": "me\u0301te\u0301o-capitals", "description": "Temp\u00e9ratures moyennes mensuelles et pr\u00e9cipitations dans les capitales
europ\u00e9ennes", "collection": "Pumice", "version": "1.1", "year": 2025, "instances": 40, "missing": 1, "variables": 28, "source": "", "url": "https://datasets.biolab.si/me\u0301te\u0301o-capitals.xlsx", "domain": "Education", "language": "Fran\u00e7ais", "target": "none", "location": "me\u0301te\u0301o-capitals.xlsx", "size": 17571, "publication_status": 0, "tag": null, "tags": [], "title": "Temp\u00e9ratures et pr\u00e9cipitations dans les capitales europ\u00e9ennes" } ], [ [ "core", "tiere.tab" ], { "name": "tiere", "description": "Deutsche \u00dcbersetzung des Zoo-Datensatzes.", "collection": "Pumice", "version": "1.0", "year": 2025, "instances": 100, "missing": 0, "variables": 18, "source": "", "url": "https://datasets.biolab.si/tiere.tab", "domain": "Education", "language": "Deutsch", "target": "categorical", "location": "tiere.tab", "size": 8488, "publication_status": 0, "tag": null, "tags": [], "title": "Tiere" } ], [ [ "core", "animaux.tab" ], { "name": "animaux", "description": "Traduction fran\u00e7aise de l'ensemble des donn\u00e9es du Zoo.", "collection": "Pumice", "version": "1.0", "year": 2025, "instances": 100, "missing": 0, "variables": 18, "source": "", "url": "https://datasets.biolab.si/animaux.tab", "domain": "Education", "language": "Fran\u00e7ais", "target": "categorical", "location": "animaux.tab", "size": 8397, "publication_status": 0, "tag": null, "tags": [], "title": "Animaux" } ] ]