diff --git a/src/backend/generalTest/CorrelationCoefficient.cpp b/src/backend/generalTest/CorrelationCoefficient.cpp index c71a99348..e241c6268 100644 --- a/src/backend/generalTest/CorrelationCoefficient.cpp +++ b/src/backend/generalTest/CorrelationCoefficient.cpp @@ -1,437 +1,437 @@ /*************************************************************************** File : CorrelationCoefficient.cpp Project : LabPlot Description : Finding Correlation Coefficient on data provided -------------------------------------------------------------------- Copyright : (C) 2019 Devanshu Agarwal(agarwaldevanshu8@gmail.com) ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 51 Franklin Street, Fifth Floor, * * Boston, MA 02110-1301 USA * * * ***************************************************************************/ #include "CorrelationCoefficient.h" #include "GeneralTest.h" #include "kdefrontend/generalTest/CorrelationCoefficientView.h" #include "backend/spreadsheet/Spreadsheet.h" #include "backend/core/column/Column.h" #include "backend/lib/macros.h" #include #include #include #include #include #include #include #include #include #include #include #include extern "C" { #include "backend/nsl/nsl_stats.h" } -CorrelationCoefficient::CorrelationCoefficient(const QString &name) : GeneralTest (name, AspectType::CorrelationCoefficient) { +CorrelationCoefficient::CorrelationCoefficient(const QString& name) : GeneralTest(name, AspectType::CorrelationCoefficient) { } CorrelationCoefficient::~CorrelationCoefficient() { } void CorrelationCoefficient::performTest(Test test, bool categoricalVariable) { //QDEBUG("in perform test"); m_statsTable = ""; m_tooltips.clear(); m_correlationValue = 0; m_statisticValue.clear(); m_pValue.clear(); for (int i = 0; i < RESULTLINESCOUNT; i++) m_resultLine[i]->clear(); switch (test) { case CorrelationCoefficient::Test::Pearson: { m_currTestName = "

" + i18n("Pearson's r Correlation Test") + "

"; performPearson(categoricalVariable); break; } case CorrelationCoefficient::Test::Kendall: m_currTestName = "

" + i18n("Kendall's Rank Correlation Test") + "

"; performKendall(); break; case CorrelationCoefficient::Test::Spearman: { m_currTestName = "

" + i18n("Spearman Correlation Coefficient Test") + "

"; performSpearman(); break; } } emit changed(); } double CorrelationCoefficient::correlationValue() const{ return m_correlationValue; } QList CorrelationCoefficient::statisticValue() const{ return m_statisticValue; } QList CorrelationCoefficient::pValue() const{ return m_pValue; } /*************************************************************************************************************************** * Private Implementations * ************************************************************************************************************************/ /*********************************************Pearson r ******************************************************************/ //Formulaes are taken from https://www.statisticssolutions.com/correlation-pearson-kendall-spearman/ // variables: // N = total number of observations // sumColx = sum of values in colx // sumSqColx = sum of square of values in colx // sumColxColy = sum of product of values in colx and coly //TODO: support for col1 is categorical. //TODO: add automatic test //TODO: add tooltip for correlation value result //TODO: find p value void CorrelationCoefficient::performPearson(bool categoricalVariable) { //QDEBUG("in pearson"); if (m_columns.count() != 2) { printError("Select only 2 columns "); return; } if (categoricalVariable) { printLine(1, "currently categorical variable not supported", "blue"); return; } QString col1Name = m_columns[0]->name(); QString col2Name = m_columns[1]->name(); if (!isNumericOrInteger(m_columns[1])) { printError("Column " + col2Name + " should contain only numeric or interger values"); } int N = findCount(m_columns[0]); if (N != findCount(m_columns[1])) { printError("Number of data values in Column: " + col1Name + "and Column: " + col2Name + "are not equal"); return; } double sumCol1 = findSum(m_columns[0], N); double sumCol2 = findSum(m_columns[1], N); double sumSqCol1 = findSumSq(m_columns[0], N); double sumSqCol2 = findSumSq(m_columns[1], N); double sumCol12 = 0; for (int i = 0; i < N; i++) sumCol12 += m_columns[0]->valueAt(i) * m_columns[1]->valueAt(i); // printing table; // cell constructor structure; data, level, rowSpanCount, m_columnspanCount, isHeader; QList rowMajor; int level = 0; // horizontal header QString sigma = UTF8_QSTRING("Σ"); rowMajor.append(new Cell("", level, true)); rowMajor.append(new Cell("N", level, true, "Total Number of Observations")); rowMajor.append(new Cell(QString(sigma + "Scores"), level, true, "Sum of Scores in each column")); rowMajor.append(new Cell(QString(sigma + "Scores2"), level, true, "Sum of Squares of scores in each column")); rowMajor.append(new Cell(QString(sigma + "(" + UTF8_QSTRING("∏") + "Scores)"), level, true, "Sum of product of scores of both columns")); //data with vertical header. level++; rowMajor.append(new Cell(col1Name, level, true)); rowMajor.append(new Cell(N, level)); rowMajor.append(new Cell(sumCol1, level)); rowMajor.append(new Cell(sumSqCol1, level)); rowMajor.append(new Cell(sumCol12, level, false, "", 2, 1)); level++; rowMajor.append(new Cell(col2Name, level, true)); rowMajor.append(new Cell(N, level)); rowMajor.append(new Cell(sumCol2, level)); rowMajor.append(new Cell(sumSqCol2, level)); m_statsTable += getHtmlTable3(rowMajor); m_correlationValue = (N * sumCol12 - sumCol1*sumCol2) / sqrt((N * sumSqCol1 - gsl_pow_2(sumCol1)) * (N * sumSqCol2 - gsl_pow_2(sumCol2))); printLine(0, QString("Correlation Value is %1").arg(round(m_correlationValue)), "green"); } /***********************************************Kendall ******************************************************************/ // used knight algorithm for fast performance O(nlogn) rather than O(n^2) // http://adereth.github.io/blog/2013/10/30/efficiently-computing-kendalls-tau/ // TODO: Change date format type to original for numeric type; // TODO: add tooltips. // TODO: Compute tauB for ties. // TODO: find P Value from Z Value void CorrelationCoefficient::performKendall() { QDEBUG("in perform kendall") if (m_columns.count() != 2) { printError("Select only 2 columns "); return; } QString col1Name = m_columns[0]->name(); QString col2Name = m_columns[1]->name(); int N = findCount(m_columns[0]); if (N != findCount(m_columns[1])) { printError("Number of data values in Column: " + col1Name + "and Column: " + col2Name + "are not equal"); QDEBUG("unequal number of rows") return; } int col2Ranks[N]; if (isNumericOrInteger(m_columns[0]) || isNumericOrInteger(m_columns[1])) { if (isNumericOrInteger(m_columns[0]) && isNumericOrInteger(m_columns[1])) { for (int i = 0; i < N; i++) col2Ranks[int(m_columns[0]->valueAt(i)) - 1] = int(m_columns[1]->valueAt(i)); } else { printError(QString("Ranking System should be same for both Column: %1 and Column: %2
" "Hint: Check for data types of columns").arg(col1Name).arg(col2Name)); QDEBUG("ranking system not same") return; } } else { AbstractColumn::ColumnMode origCol1Mode = m_columns[0]->columnMode(); AbstractColumn::ColumnMode origCol2Mode = m_columns[1]->columnMode(); m_columns[0]->setColumnMode(AbstractColumn::Text); m_columns[1]->setColumnMode(AbstractColumn::Text); QMap ValueToRank; for (int i = 0; i < N; i++) { if (ValueToRank[m_columns[0]->textAt(i)] != 0) { printError("Currently ties are not supported"); m_columns[0]->setColumnMode(origCol1Mode); m_columns[1]->setColumnMode(origCol2Mode); return; } ValueToRank[m_columns[0]->textAt(i)] = i + 1; } for (int i = 0; i < N; i++) col2Ranks[i] = ValueToRank[m_columns[1]->textAt(i)]; m_columns[0]->setColumnMode(origCol1Mode); m_columns[1]->setColumnMode(origCol2Mode); } int nPossiblePairs = (N * (N - 1)) / 2; int nDiscordant = findDiscordants(col2Ranks, 0, N - 1); int nCorcordant = nPossiblePairs - nDiscordant; m_correlationValue = double(nCorcordant - nDiscordant) / nPossiblePairs; m_statisticValue.append((3 * (nCorcordant - nDiscordant)) / sqrt(N * (N- 1) * (2 * N + 5) / 2)); printLine(0 , QString("Number of Discordants are %1").arg(nDiscordant), "green"); printLine(1 , QString("Number of Concordant are %1").arg(nCorcordant), "green"); printLine(2 , QString("Tau a is %1").arg(round(m_correlationValue)), "green"); printLine(3 , QString("Z Value is %1").arg(round(m_statisticValue[0])), "green"); return; } /***********************************************Spearman ******************************************************************/ // All formulaes and symbols are taken from : https://www.statisticshowto.datasciencecentral.com/spearman-rank-correlation-definition-calculate/ void CorrelationCoefficient::performSpearman() { if (m_columns.count() != 2) { printError("Select only 2 columns "); return; } QString col1Name = m_columns[0]->name(); QString col2Name = m_columns[1]->name(); int N = findCount(m_columns[0]); if (N != findCount(m_columns[1])) { printError("Number of data values in Column: " + col1Name + "and Column: " + col2Name + "are not equal"); return; } QMap col1Ranks; convertToRanks(m_columns[0], N, col1Ranks); QMap col2Ranks; convertToRanks(m_columns[1], N, col2Ranks); double ranksCol1Mean = 0; double ranksCol2Mean = 0; // QString ranks1 = ""; // QString ranks2 = ""; for (int i = 0; i < N; i++) { ranksCol1Mean += col1Ranks[int(m_columns[0]->valueAt(i))]; ranksCol2Mean += col2Ranks[int(m_columns[1]->valueAt(i))]; // ranks1 += ", " + QString::number(col1Ranks[m_columns[0]->valueAt(i)]); // ranks2 += ", " + QString::number(col2Ranks[m_columns[1]->valueAt(i)]); } ranksCol1Mean = ranksCol1Mean / N; ranksCol2Mean = ranksCol2Mean / N; //QDEBUG("ranks 1 and ranks2 are " ); //QDEBUG(ranks1); //QDEBUG(ranks2); //QDEBUG("Mean ranks are " << ranksCol1Mean << ranksCol2Mean); double s12 = 0; double s1 = 0; double s2 = 0; for (int i = 0; i < N; i++) { double centeredRank_1 = col1Ranks[int(m_columns[0]->valueAt(i))] - ranksCol1Mean; double centeredRank_2 = col2Ranks[int(m_columns[1]->valueAt(i))] - ranksCol2Mean; s12 += centeredRank_1 * centeredRank_2; s1 += gsl_pow_2(centeredRank_1); s2 += gsl_pow_2(centeredRank_2); } s12 = s12 / N; s1 = s1 / N; s2 = s2 / N; //QDEBUG("s12, s1, s2 are " << s12 << " " << s1 << " " << s2); m_correlationValue = s12 / std::sqrt(s1 * s2); printLine(0, QString("Spearman Rank Correlation value is %1").arg(m_correlationValue), "green"); } /***********************************************Helper Functions******************************************************************/ int CorrelationCoefficient::findDiscordants(int *ranks, int start, int end) { if (start >= end) return 0; int mid = (start + end) / 2; int leftDiscordants = findDiscordants(ranks, start, mid); int rightDiscordants = findDiscordants(ranks, mid + 1, end); int len = end - start + 1; int leftLen = mid - start + 1; int rightLen = end - mid; int leftLenRemain = leftLen; int leftRanks[leftLen]; int rightRanks[rightLen]; for (int i = 0; i < leftLen; i++) leftRanks[i] = ranks[start + i]; for (int i = leftLen; i < leftLen + rightLen; i++) rightRanks[i - leftLen] = ranks[start + i]; int mergeDiscordants = 0; int i = 0, j = 0, k =0; while (i < len) { if (j >= leftLen) { ranks[start + i] = rightRanks[k]; k++; } else if (k >= rightLen) { ranks[start + i] = leftRanks[j]; j++; } else if (leftRanks[j] < rightRanks[k]) { ranks[start + i] = leftRanks[j]; j++; leftLenRemain--; } else if (leftRanks[j] > rightRanks[k]) { ranks[start + i] = rightRanks[k]; mergeDiscordants += leftLenRemain; k++; } i++; } return leftDiscordants + rightDiscordants + mergeDiscordants; } void CorrelationCoefficient::convertToRanks(const Column* col, int N, QMap &ranks) { if (!isNumericOrInteger(col)) return; //QDEBUG("in convert to ranks"); double* sortedList = new double[N]; for (int i = 0; i < N; i++) sortedList[i] = col->valueAt(i); std::sort(sortedList, sortedList + N, std::greater()); // QString debug_sortedList = ""; ranks.clear(); for (int i = 0; i < N; i++) { ranks[sortedList[i]] = i + 1; // debug_sortedList += ", " + QString::number(sortedList[i]); } //QDEBUG("sorted list is " << debug_sortedList); delete[] sortedList; } void CorrelationCoefficient::convertToRanks(const Column* col, QMap &ranks) { convertToRanks(col, findCount(col), ranks); } /***********************************************Virtual Functions******************************************************************/ QWidget* CorrelationCoefficient::view() const { if (!m_partView) { m_view = new CorrelationCoefficientView(const_cast(this)); m_partView = m_view; } return m_partView; } diff --git a/src/backend/generalTest/GeneralTest.cpp b/src/backend/generalTest/GeneralTest.cpp index 5a8b8c278..1b0e47557 100644 --- a/src/backend/generalTest/GeneralTest.cpp +++ b/src/backend/generalTest/GeneralTest.cpp @@ -1,557 +1,558 @@ /*************************************************************************** File : GeneralTest.cpp Project : LabPlot Description : Doing Hypothesis-Test on data provided -------------------------------------------------------------------- Copyright : (C) 2019 Devanshu Agarwal(agarwaldevanshu8@gmail.com) ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 51 Franklin Street, Fifth Floor, * * Boston, MA 02110-1301 USA * * * ***************************************************************************/ #include "GeneralTest.h" #include "kdefrontend/generalTest/HypothesisTestView.h" #include "backend/spreadsheet/Spreadsheet.h" #include "backend/core/column/Column.h" #include "backend/lib/macros.h" //#include //#include //#include #include #include //#include //#include //#include #include #include #include extern "C" { #include "backend/nsl/nsl_stats.h" } -GeneralTest::GeneralTest(const QString &name, const AspectType& type) : AbstractPart(name, type), + +GeneralTest::GeneralTest(const QString& name, const AspectType& type) : AbstractPart(name, type), m_summaryLayout(new QVBoxLayout()) { for (int i = 0; i < 10; i++) { m_resultLine[i] = new QLabel(); m_summaryLayout->addWidget(m_resultLine[i]); } } GeneralTest::~GeneralTest() { } void GeneralTest::setDataSourceType(DataSourceType type) { if (type != m_dataSourceType) m_dataSourceType = type; } GeneralTest::DataSourceType GeneralTest::dataSourceType() const { return m_dataSourceType; } void GeneralTest::setDataSourceSpreadsheet(Spreadsheet* spreadsheet) { m_dataSourceSpreadsheet = spreadsheet; for (auto* col : m_dataSourceSpreadsheet->children()) m_allColumns << col->name(); } QString GeneralTest::testName() { return m_currTestName; } QString GeneralTest::statsTable() { return m_statsTable; } QMap GeneralTest::tooltips() { return m_tooltips; } QVBoxLayout* GeneralTest::summaryLayout() { return m_summaryLayout; } void GeneralTest::setColumns(QStringList cols) { m_columns.clear(); Column* column = new Column("column"); for (QString col : cols) { if (!cols.isEmpty()) { column = m_dataSourceSpreadsheet->column(col); m_columns.append(column); } } delete[] column; } void GeneralTest::setColumns(const QVector &cols) { m_columns = cols; } /******************************************************************************************************************** * Protected functions implementations [Helper Functions] ********************************************************************************************************************/ QString GeneralTest::round(QVariant number, int precision) { if (number.userType() == QMetaType::Double || number.userType() == QMetaType::Float) { double multiplierPrecision = gsl_pow_int(10, precision); int tempNum = int(number.toDouble()*multiplierPrecision*10); if (tempNum % 10 < 5) return QString::number((tempNum/10) / multiplierPrecision); else return QString::number((tempNum/10 + 1) / multiplierPrecision); } return i18n("%1", number.toString()); } bool GeneralTest::isNumericOrInteger(const Column* column) { return (column->columnMode() == AbstractColumn::Numeric || column->columnMode() == AbstractColumn::Integer); } int GeneralTest::findCount(const Column *column) { int N = column->rowCount(); switch (column->columnMode()) { case (AbstractColumn::Numeric): case (AbstractColumn::Integer): { for (int i = 0; i < N; i++) if (std::isnan(column->valueAt(i))) { N = i; break; } break; } case (AbstractColumn::Month): case (AbstractColumn::Day): case (AbstractColumn::Text): { for (int i = 0; i < N; i++) if (column->textAt(i).isEmpty()) { N = i; break; } break; } case (AbstractColumn::DateTime): break; } return N; } double GeneralTest::findSum(const Column *column, int N) { if (!isNumericOrInteger(column)) return 0; if (N < 0) N = findCount(column); double sum = 0; for (int i = 0; i < N; i++) sum += column->valueAt(i); return sum; } double GeneralTest::findSumSq(const Column *column, int N) { if (!isNumericOrInteger(column)) return 0; if (N < 0) N = findCount(column); double sumSq = 0; for (int i = 0; i < N; i++) sumSq += gsl_pow_2(column->valueAt(i)); return sumSq; } double GeneralTest::findMean(const Column *column, int N) { if (!isNumericOrInteger(column)) return 0; if (N < 0) N = findCount(column); double sum = findSum(column, N); return sum / N; } double GeneralTest::findStd(const Column *column, int N, double mean) { if (!isNumericOrInteger(column)) return 0; double std = 0; for (int i = 0; i < N; i++) { double row = column->valueAt(i); std += gsl_pow_2( (row - mean)); } if (N > 1) std = std / (N-1); std = sqrt(std); return std; } double GeneralTest::findStd(const Column *column, int N) { if (!isNumericOrInteger(column)) return 0; if (N < 0) N = findCount(column); double mean = findMean(column, N); return findStd(column, N, mean); } GeneralTest::ErrorType GeneralTest::findStats(const Column* column, int& count, double& sum, double& mean, double& std) { count = findCount(column); sum = findSum(column, count); mean = findMean(column, count); std = findStd(column, count, mean); if (count < 1) return GeneralTest::ErrorEmptyColumn; return GeneralTest::NoError; } GeneralTest::ErrorType GeneralTest::findStatsPaired(const Column* column1, const Column* column2, int& count, double& sum, double& mean, double& std) { sum = 0; mean = 0; std = 0; int count1 = column1->rowCount(); int count2 = column2->rowCount(); count = qMin(count1, count2); double cell1, cell2; for (int i = 0; i < count; i++) { cell1 = column1->valueAt(i); cell2 = column2->valueAt(i); if (std::isnan(cell1) || std::isnan(cell2)) { if (std::isnan(cell1) && std::isnan(cell2)) count = i; else return GeneralTest::ErrorUnqualSize; break; } sum += cell1 - cell2; } if (count < 1) return GeneralTest::ErrorEmptyColumn; mean = sum / count; double row; for (int i = 0; i < count; i++) { cell1 = column1->valueAt(i); cell2 = column2->valueAt(i); row = cell1 - cell2; std += gsl_pow_2( (row - mean)); } if (count > 1) std = std / (count-1); std = sqrt(std); return GeneralTest::NoError; } void GeneralTest::countPartitions(Column* column, int& np, int& totalRows) { totalRows = column->rowCount(); np = 0; QString cellValue; QMap discoveredCategoricalVar; AbstractColumn::ColumnMode originalColMode = column->columnMode(); column->setColumnMode(AbstractColumn::Text); for (int i = 0; i < totalRows; i++) { cellValue = column->textAt(i); if (cellValue.isEmpty()) { totalRows = i; break; } if (discoveredCategoricalVar[cellValue]) continue; discoveredCategoricalVar[cellValue] = true; np++; } column->setColumnMode(originalColMode); } GeneralTest::ErrorType GeneralTest::findStatsCategorical(Column* column1, Column* column2, int n[], double sum[], double mean[], double std[], QMap& colName, const int& np, const int& totalRows) { Column* columns[] = {column1, column2}; for (int i = 0; i < np; i++) { n[i] = 0; sum[i] = 0; mean[i] = 0; std[i] = 0; } AbstractColumn::ColumnMode originalColMode = columns[0]->columnMode(); columns[0]->setColumnMode(AbstractColumn::Text); int partitionNumber = 1; for (int i = 0; i < totalRows; i++) { QString name = columns[0]->textAt(i); double value = columns[1]->valueAt(i); if (std::isnan(value)) { columns[0]->setColumnMode(originalColMode); return GeneralTest::ErrorUnqualSize; } if (colName[name] == 0) { colName[name] = partitionNumber; partitionNumber++; } n[colName[name]-1]++; sum[colName[name]-1] += value; } for (int i = 0; i < np; i++) mean[i] = sum[i] / n[i]; for (int i = 0; i < totalRows; i++) { QString name = columns[0]->textAt(i); double value = columns[1]->valueAt(i); std[colName[name]-1] += gsl_pow_2( (value - mean[colName[name]-1])); } for (int i = 0; i < np; i++) { if (n[i] > 1) std[i] = std[i] / (n[i] - 1); std[i] = sqrt(std[i]); } columns[0]->setColumnMode(originalColMode); if (isNumericOrInteger(columns[0])) { } return GeneralTest::NoError; } QString GeneralTest::getHtmlTable(int row, int column, QVariant* rowMajor) { if (row < 1 || column < 1) return QString(); QString table; table = "" "" " "; QString bg = "tg-0pky"; bool pky = true; QString element; table += " "; for (int j = 0; j < column; j++) { element = rowMajor[j].toString(); table += " "; } table += " "; if (pky) bg = "tg-0pky"; else bg = "tg-btxf"; pky = !pky; for (int i = 1; i < row; i++) { table += " "; QString element = round(rowMajor[i*column]); table += " "; for (int j = 1; j < column; j++) { element = round(rowMajor[i*column+j]); table += " "; } table += " "; if (pky) bg = "tg-0pky"; else bg = "tg-btxf"; pky = !pky; } table += "
" + i18n("%1", element) + "
" + i18n("%1", element) + "" + i18n("%1", element) + "
"; return table; } QString GeneralTest::getHtmlTable3(const QList& rowMajor) { m_tooltips.clear(); int rowMajorSize = rowMajor.size(); if (rowMajorSize == 0) return QString(); QString table; table = ""; table += ""; table += " "; int prevLevel = 0; for (int i = 0; i < rowMajorSize; i++) { Cell* currCell = rowMajor[i]; if (currCell->level != prevLevel) { table += " "; table += " "; prevLevel = currCell->level; } QString cellStartTag = ""; table += "
isHeader) { cellStartTag = "" + i18n("%1", currCell->data) + cellEndTag; if (!currCell->tooltip.isEmpty()) m_tooltips.insert(currCell->data, currCell->tooltip); } table += "
"; return table; } QString GeneralTest::getLine(const QString& msg, const QString& color) { return "

" + i18n("%1", msg) + "

"; } void GeneralTest::printLine(const int& index, const QString& msg, const QString& color) { if (index < 0 || index >= 10) return; m_resultLine[index]->setText(getLine(msg, color)); return; } void GeneralTest::printTooltip(const int &index, const QString &msg) { if (index < 0 || index >= 10) return; m_resultLine[index]->setToolTip(i18n("%1", msg)); } void GeneralTest::printError(const QString& errorMsg) { printLine(0, errorMsg, "red"); } /******************************************************************************************************************** * virtual functions implementations ********************************************************************************************************************/ /*! Saves as XML. */ void GeneralTest::save(QXmlStreamWriter* writer) const { writer->writeStartElement("GeneralTest"); writeBasicAttributes(writer); writeCommentElement(writer); writer->writeEndElement(); } /*! Loads from XML. */ bool GeneralTest::load(XmlStreamReader* reader, bool preview) { Q_UNUSED(preview); if (!readBasicAttributes(reader)) return false; return !reader->hasError(); } Spreadsheet *GeneralTest::dataSourceSpreadsheet() const { return m_dataSourceSpreadsheet; } bool GeneralTest::exportView() const { return true; } bool GeneralTest::printView() { return true; } bool GeneralTest::printPreview() const { return true; } /*! Constructs a primary view on me. This method may be called multiple times during the life time of an Aspect, or it might not get called at all. Aspects must not depend on the existence of a view for their operation. */ //QWidget* GeneralTest::view() const { // if (!m_partView) { // m_view = new HypothesisTestView(const_cast(this)); // m_partView = m_view; // } // return m_partView; //} /*! Returns a new context menu. The caller takes ownership of the menu. */ QMenu* GeneralTest::createContextMenu() { QMenu* menu = AbstractPart::createContextMenu(); // Q_ASSERT(menu); // emit requestProjectContextMenu(menu); return menu; } diff --git a/src/backend/generalTest/HypothesisTest.cpp b/src/backend/generalTest/HypothesisTest.cpp index 4ab760228..1fc5ee438 100644 --- a/src/backend/generalTest/HypothesisTest.cpp +++ b/src/backend/generalTest/HypothesisTest.cpp @@ -1,1143 +1,1143 @@ /*************************************************************************** File : HypothesisTest.cpp Project : LabPlot Description : Doing Hypothesis-Test on data provided -------------------------------------------------------------------- Copyright : (C) 2019 Devanshu Agarwal(agarwaldevanshu8@gmail.com) ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 51 Franklin Street, Fifth Floor, * * Boston, MA 02110-1301 USA * * * ***************************************************************************/ #include "HypothesisTest.h" #include "kdefrontend/generalTest/HypothesisTestView.h" #include "backend/spreadsheet/Spreadsheet.h" #include "backend/core/column/Column.h" #include "backend/lib/macros.h" #include #include #include #include #include #include #include #include #include #include #include extern "C" { #include "backend/nsl/nsl_stats.h" } -HypothesisTest::HypothesisTest(const QString &name) : GeneralTest (name, AspectType::HypothesisTest) { +HypothesisTest::HypothesisTest(const QString &name) : GeneralTest(name, AspectType::HypothesisTest) { } HypothesisTest::~HypothesisTest() { } -void HypothesisTest::setPopulationMean(QVariant m_populationMean) { - m_populationMean = m_populationMean.toDouble(); +void HypothesisTest::setPopulationMean(QVariant populationMean) { + m_populationMean = populationMean.toDouble(); } void HypothesisTest::setSignificanceLevel(QVariant alpha) { m_significanceLevel = alpha.toDouble(); } void HypothesisTest::performTest(Test test, bool categoricalVariable, bool equalVariance) { m_tailType = test.tail; m_pValue.clear(); m_statisticValue.clear(); m_statsTable = ""; m_tooltips.clear(); for (int i = 0; i < RESULTLINESCOUNT; i++) m_resultLine[i]->clear(); switch (test.subtype) { case HypothesisTest::Test::SubType::TwoSampleIndependent: { m_currTestName = "

" + i18n("Two Sample Independent Test") + "

"; performTwoSampleIndependentTest(test.type, categoricalVariable, equalVariance); break; } case HypothesisTest::Test::SubType::TwoSamplePaired: m_currTestName = "

" + i18n("Two Sample Paired Test") + "

"; performTwoSamplePairedTest(test.type); break; case HypothesisTest::Test::SubType::OneSample: { m_currTestName = "

" + i18n("One Sample Test") + "

"; performOneSampleTest(test.type); break; } case HypothesisTest::Test::SubType::OneWay: { m_currTestName = "

" + i18n("One Way Anova") + "

"; performOneWayAnova(); break; } case HypothesisTest::Test::SubType::TwoWay: { m_currTestName = "

" + i18n("Two Way Anova") + "

"; performTwoWayAnova(); break; } case HypothesisTest::Test::SubType::NoneSubType: break; } emit changed(); } void HypothesisTest::performLeveneTest(bool categoricalVariable) { m_pValue.clear(); m_statisticValue.clear(); m_statsTable = ""; m_tooltips.clear(); for (int i = 0; i < RESULTLINESCOUNT; i++) m_resultLine[i]->clear(); m_currTestName = "

" + i18n("Levene Test for Equality of Variance") + "

"; m_performLeveneTest(categoricalVariable); emit changed(); } QList& HypothesisTest::statisticValue(){ return m_statisticValue; } QList& HypothesisTest::pValue(){ return m_pValue; } /****************************************************************************** * Private Implementations * ****************************************************************************/ //TODO: backend of z test; //TODO: add tooltip to tables. (currently it is not possible to use with QTextDocument); //TODO: use https://www.gnu.org/software/gsl/doc/html/statistics.html for basic statistic calculations /**************************Two Sample Independent *************************************/ void HypothesisTest::performTwoSampleIndependentTest(HypothesisTest::Test::Type test, bool categoricalVariable, bool equalVariance) { if (m_columns.size() != 2) { printError("Inappropriate number of m_columns selected"); return; } int n[2]; double sum[2], mean[2], std[2]; QString col1Name = m_columns[0]->name(); QString col2Name = m_columns[1]->name(); if (!categoricalVariable && isNumericOrInteger(m_columns[0])) { for (int i = 0; i < 2; i++) { findStats(m_columns[i], n[i], sum[i], mean[i], std[i]); if (n[i] == 0) { printError("At least two values should be there in every column"); return; } - if (gsl_fcmp(std[i], 0., 1.e-16)) { + if (std[i] == 0.0) { printError(i18n("Standard Deviation of at least one column is equal to 0: last column is: %1", m_columns[i]->name())); return; } } } else { QMap colName; QString baseColName; int np; int totalRows; countPartitions(m_columns[0], np, totalRows); if (np != 2) { printError( i18n("Number of Categorical Variable in Column %1 is not equal to 2", m_columns[0]->name())); return; } if (isNumericOrInteger(m_columns[0])) baseColName = m_columns[0]->name(); ErrorType errorCode = findStatsCategorical(m_columns[0], m_columns[1], n, sum, mean, std, colName, np, totalRows); switch (errorCode) { case ErrorUnqualSize: { printError( i18n("Unequal size between Column %1 and Column %2", m_columns[0]->name(), m_columns[1]->name())); return; } case ErrorEmptyColumn: { printError("At least one of selected column is empty"); return; } case NoError: break; } QMapIterator i(colName); while (i.hasNext()) { i.next(); if (i.value() == 1) col1Name = baseColName + " " + i.key(); else col2Name = baseColName + " " + i.key(); } } QVariant rowMajor[] = {"", "N", "Sum", "Mean", "Std", col1Name, n[0], sum[0], mean[0], std[0], col2Name, n[1], sum[1], mean[1], std[1] }; m_statsTable = getHtmlTable(3, 5, rowMajor); for (int i = 0; i < 2; i++) { if (n[i] == 0) { printError("At least two values should be there in every column"); return; } - if (gsl_fcmp(std[i], 0., 1.e-16)) { + if (std[i] == 0.0) { printError( i18n("Standard Deviation of at least one column is equal to 0: last column is: %1", m_columns[i]->name())); return; } } QString testName; int df = 0; double sp = 0; switch (test) { case HypothesisTest::Test::Type::TTest: { testName = "T"; if (equalVariance) { df = n[0] + n[1] - 2; sp = qSqrt(((n[0]-1) * gsl_pow_2(std[0]) + (n[1]-1) * gsl_pow_2(std[1]) ) / df ); m_statisticValue.append((mean[0] - mean[1]) / (sp * qSqrt(1.0/n[0] + 1.0/n[1]))); printLine(9, "Assumption: Equal Variance b/w both population means"); } else { double temp_val; temp_val = gsl_pow_2( gsl_pow_2(std[0]) / n[0] + gsl_pow_2(std[1]) / n[1]); temp_val = temp_val / ( (gsl_pow_2( (gsl_pow_2(std[0]) / n[0]) ) / (n[0]-1)) + (gsl_pow_2( (gsl_pow_2(std[1]) / n[1]) ) / (n[1]-1))); df = qRound(temp_val); m_statisticValue.append((mean[0] - mean[1]) / (qSqrt( (gsl_pow_2(std[0])/n[0]) + (gsl_pow_2(std[1])/n[1])))); printLine(9, "Assumption: UnEqual Variance b/w both population means"); } printLine(8, "Assumption: Both Populations approximately follow normal distribution"); break; } case HypothesisTest::Test::Type::ZTest: { testName = "Z"; sp = qSqrt( ((n[0]-1) * gsl_pow_2(std[0]) + (n[1]-1) * gsl_pow_2(std[1])) / df); m_statisticValue.append((mean[0] - mean[1]) / (sp * qSqrt( 1.0 / n[0] + 1.0 / n[1]))); // m_pValue.append(gsl_cdf_gaussian_P(m_statisticValue, sp)); break; } case HypothesisTest::Test::Type::Anova: case HypothesisTest::Test::Type::NoneType: break; } m_currTestName = "

" + i18n("Two Sample Independent %1 Test for %2 vs %3", testName, col1Name, col2Name) + "

"; m_pValue.append(getPValue(test, m_statisticValue[0], col1Name, col2Name, (mean[0] - mean[1]), sp, df)); printLine(2, i18n("Significance level is %1", round(m_significanceLevel)), "blue"); printLine(4, i18n("%1 Value is %2 ", testName, round(m_statisticValue[0])), "green"); printTooltip(4, i18n("More is the |%1-value|, more safely we can reject the null hypothesis", testName)); printLine(5, i18n("P Value is %1 ", m_pValue[0]), "green"); printLine(6, i18n("Degree of Freedom is %1", df), "green"); printTooltip(6, i18n("Number of independent Pieces of information that went into calculating the estimate")); if (m_pValue[0] <= m_significanceLevel) printTooltip(5, i18n("We can safely reject Null Hypothesis for significance level %1", round(m_significanceLevel))); else printTooltip(5, i18n("There is a plausibility for Null Hypothesis to be true")); return; } /********************************Two Sample Paired ***************************************/ void HypothesisTest::performTwoSamplePairedTest(HypothesisTest::Test::Type test) { if (m_columns.size() != 2) { printError("Inappropriate number of m_columns selected"); return; } for (int i = 0; i < 2; i++) { if ( !isNumericOrInteger(m_columns[0])) { printError("select only m_columns with numbers"); return; } } int n; double sum, mean, std; ErrorType errorCode = findStatsPaired(m_columns[0], m_columns[1], n, sum, mean, std); switch (errorCode) { case ErrorUnqualSize: { printError("both m_columns are having different sizes"); return; } case ErrorEmptyColumn: { printError("m_columns are empty"); return; } case NoError: break; } QVariant rowMajor[] = {"", "N", "Sum", "Mean", "Std", "difference", n, sum, mean, std }; m_statsTable = getHtmlTable(2, 5, rowMajor); - if (gsl_fcmp(std, 0., 1.e-16)) { + if (std == 0.0) { printError("Standard deviation of the difference is 0"); return; } QString testName; int df = 0; switch (test) { case HypothesisTest::Test::Type::TTest: { m_statisticValue[0] = mean / (std / qSqrt(n)); df = n - 1; testName = "T"; printLine(6, i18n("Degree of Freedom is %1name(), i18n("%1", m_populationMean), mean, std, df)); m_currTestName = "

" + i18n("One Sample %1 Test for %2 vs %3", testName, m_columns[0]->name(), m_columns[1]->name()) + "

"; printLine(2, i18n("Significance level is %1 ", round(m_significanceLevel)), "blue"); printLine(4, i18n("%1 Value is %2 ", testName, round(m_statisticValue[0])), "green"); printLine(5, i18n("P Value is %1 ", m_pValue[0]), "green"); if (m_pValue[0] <= m_significanceLevel) printTooltip(5, i18n("We can safely reject Null Hypothesis for significance level %1", m_significanceLevel)); else printTooltip(5, i18n("There is a plausibility for Null Hypothesis to be true")); return; } /******************************** One Sample ***************************************/ void HypothesisTest::performOneSampleTest(HypothesisTest::Test::Type test) { if (m_columns.size() != 1) { printError("Inappropriate number of m_columns selected"); return; } if ( !isNumericOrInteger(m_columns[0])) { printError("select only m_columns with numbers"); return; } int n; double sum, mean, std; ErrorType errorCode = findStats(m_columns[0], n, sum, mean, std); switch (errorCode) { case ErrorEmptyColumn: { printError("column is empty"); return; } case NoError: break; case ErrorUnqualSize: { return; } } QVariant rowMajor[] = {"", "N", "Sum", "Mean", "Std", m_columns[0]->name(), n, sum, mean, std }; m_statsTable = getHtmlTable(2, 5, rowMajor); - if (gsl_fcmp(std, 0., 1.e-16)) { + if (std == 0.0) { printError("Standard deviation is 0"); return; } QString testName; int df = 0; switch (test) { case HypothesisTest::Test::Type::TTest: { testName = "T"; m_statisticValue.append((mean - m_populationMean) / (std / qSqrt(n))); df = n - 1; printLine(6, i18n("Degree of Freedom is %1", df), "blue"); break; } case HypothesisTest::Test::Type::ZTest: { testName = "Z"; df = 0; m_statisticValue.append((mean - m_populationMean) / (std / qSqrt(n))); break; } case HypothesisTest::Test::Type::Anova: case HypothesisTest::Test::Type::NoneType: break; } m_pValue.append(getPValue(test, m_statisticValue[0], m_columns[0]->name(), i18n("%1",m_populationMean), mean - m_populationMean, std, df)); m_currTestName = "

" + i18n("One Sample %1 Test for %2", testName, m_columns[0]->name()) + "

"; printLine(2, i18n("Significance level is %1", round(m_significanceLevel)), "blue"); printLine(4, i18n("%1 Value is %2", testName, round(m_statisticValue[0])), "green"); printLine(5, i18n("P Value is %1", m_pValue[0]), "green"); if (m_pValue[0] <= m_significanceLevel) printTooltip(5, i18n("We can safely reject Null Hypothesis for significance level %1", m_significanceLevel)); else printTooltip(5, i18n("There is a plausibility for Null Hypothesis to be true")); return; } /*************************************One Way Anova***************************************/ // all standard variables and formulas are taken from this wikipedia page: // https://en.wikipedia.org/wiki/One-way_analysis_of_variance // b stands for b/w groups // w stands for within groups // np is number of partition i.e., number of classes void HypothesisTest::performOneWayAnova() { int np, totalRows; countPartitions(m_columns[0], np, totalRows); int* ni = new int[np]; double* sum = new double[np]; double* mean = new double[np]; double* std = new double[np]; QString* colNames = new QString[np]; QMap classnameToIndex; QString baseColName; if (isNumericOrInteger(m_columns[0])) baseColName = m_columns[0]->name(); findStatsCategorical(m_columns[0], m_columns[1], ni, sum, mean, std, classnameToIndex, np, totalRows); double yBar = 0; // overall mean double sB = 0; // sum of squares of (mean - overall_mean) between the groups int fB = 0; // degree of freedom between the groups double msB = 0; // mean sum of squares between the groups double sW = 0; // sum of squares of (value - mean of group) within the groups int fW = 0; // degree of freedom within the group double msW = 0; // mean sum of squares within the groups // now finding mean of each group; for (int i = 0; i < np; i++) yBar += mean[i]; yBar = yBar / np; for (int i = 0; i < np; i++) { sB += ni[i] * gsl_pow_2( ( mean[i] - yBar)); if (ni[i] > 1) sW += gsl_pow_2( std[i])*(ni[i] - 1); else sW += gsl_pow_2( std[i]); fW += ni[i] - 1; } fB = np - 1; msB = sB / fB; msW = sW / fW; m_statisticValue.append(msB / msW); m_pValue.append(nsl_stats_fdist_p(m_statisticValue[0], static_cast(np-1), fW)); QMapIterator i(classnameToIndex); while (i.hasNext()) { i.next(); colNames[i.value()-1] = baseColName + " " + i.key(); } // now printing the statistics and result; int rowCount = np + 1, columnCount = 5; QVariant* rowMajor = new QVariant[rowCount*columnCount]; // header data; rowMajor[0] = ""; rowMajor[1] = "Ni"; rowMajor[2] = "Sum"; rowMajor[3] = "Mean"; rowMajor[4] = "Std"; // table data for (int row_i = 1; row_i < rowCount ; row_i++) { rowMajor[row_i*columnCount] = colNames[row_i - 1]; rowMajor[row_i*columnCount + 1] = ni[row_i - 1]; rowMajor[row_i*columnCount + 2] = sum[row_i - 1]; rowMajor[row_i*columnCount + 3] = mean[row_i - 1]; rowMajor[row_i*columnCount + 4] = std[row_i - 1]; } m_statsTable = "

" + i18n("Group Summary Statistics") + "

"; m_statsTable += getHtmlTable(rowCount, columnCount, rowMajor); m_statsTable += getLine(""); m_statsTable += getLine(""); m_statsTable += "

" + i18n("Grand Summary Statistics") + "

"; m_statsTable += getLine(""); m_statsTable += getLine(i18n("Overall Mean is %1", round(yBar))); rowCount = 4; columnCount = 3; rowMajor->clear(); rowMajor[0] = ""; rowMajor[1] = "Between Groups"; rowMajor[2] = "Within Groups"; int baseIndex = 0; baseIndex = 1 * columnCount; rowMajor[baseIndex + 0] = "Sum of Squares"; rowMajor[baseIndex + 1] = sB; rowMajor[baseIndex + 2] = sW; baseIndex = 2 * columnCount; rowMajor[baseIndex + 0] = "Degree of Freedom"; rowMajor[baseIndex + 1] = fB; rowMajor[baseIndex + 2] = fW; baseIndex = 3 * columnCount; rowMajor[baseIndex + 0] = "Mean Square Value"; rowMajor[baseIndex + 1] = msB; rowMajor[baseIndex + 2] = msW; m_statsTable += getHtmlTable(rowCount, columnCount, rowMajor); delete[] ni; delete[] sum; delete[] mean; delete[] std; delete[] colNames; printLine(1, i18n("F Value is %1", round(m_statisticValue[0])), "green"); printLine(2, i18n("P Value is %1 ", m_pValue[0]), "green"); if (m_pValue[0] <= m_significanceLevel) printTooltip(2, i18n("We can safely reject Null Hypothesis for significance level %1", m_significanceLevel)); else printTooltip(2, i18n("There is a plausibility for Null Hypothesis to be true")); return; } /*************************************Two Way Anova***************************************/ // all formulas and symbols are taken from: http://statweb.stanford.edu/~susan/courses/s141/exanova.pdf //TODO: suppress warning of variable length array are a C99 feature. //TODO: add assumptions verification option //TODO: add tail option (if needed) void HypothesisTest::performTwoWayAnova() { int np_a, totalRows_a; int np_b, totalRows_b; countPartitions(m_columns[0], np_a, totalRows_a); countPartitions(m_columns[1], np_b, totalRows_b); double groupMean[np_a][np_b]; int replicates[np_a][np_b]; for (int i = 0; i < np_a; i++) for (int j = 0; j < np_b; j++) { groupMean[i][j] = 0; replicates[i][j] = 0; } if (totalRows_a != totalRows_b) { printError("There is missing data in at least one of the rows"); return; } QMap catToNumber_a; QMap catToNumber_b; int partitionNumber_a = 1; int partitionNumber_b = 1; for (int i = 0; i < totalRows_a; i++) { QString name_a = m_columns[0]->textAt(i); QString name_b = m_columns[1]->textAt(i); double value = m_columns[2]->valueAt(i); if (catToNumber_a[name_a] == 0) { catToNumber_a[name_a] = partitionNumber_a; partitionNumber_a++; } if (catToNumber_b[name_b] == 0) { catToNumber_b[name_b] = partitionNumber_b; partitionNumber_b++; } groupMean[catToNumber_a[name_a] - 1][catToNumber_b[name_b] - 1] += value; replicates[catToNumber_a[name_a] - 1][catToNumber_b[name_b] - 1] += 1; } int replicate = replicates[0][0]; for (int i = 0; i < np_a; i++) for (int j = 0; j < np_b; j++) { if (replicates[i][j] == 0) { printError("Dataset should have at least one data value corresponding to each feature combination"); return; } if (replicates[i][j] != replicate) { printError("Number of experiments perfomed for each combination of levels
" "between Independet Var.1 and Independent Var.2 must be equal"); return; } groupMean[i][j] /= replicates[i][j]; } double ss_within = 0; for (int i = 0; i < totalRows_a; i++) { QString name_a = m_columns[0]->textAt(i); QString name_b = m_columns[1]->textAt(i); double value = m_columns[2]->valueAt(i); ss_within += gsl_pow_2(value - groupMean[catToNumber_a[name_a] - 1][catToNumber_b[name_b] - 1]); } int df_within = (replicate - 1) * np_a * np_b; double ms_within = ss_within / df_within; double* mean_a = new double[np_a]; double* mean_b = new double[np_b]; for (int i = 0; i < np_a; i++) { for (int j = 0; j < np_b; j++) { mean_a[i] += groupMean[i][j] / np_b; mean_b[j] += groupMean[i][j] / np_a; } } double mean = 0; for (int i = 0; i < np_a; i++) mean += mean_a[i] / np_a; double ss_a = 0; for (int i = 0; i < np_a; i++) ss_a += gsl_pow_2(mean_a[i] - mean); ss_a *= replicate * np_b; int df_a = np_a - 1; double ms_a = ss_a / df_a; double ss_b = 0; for (int i = 0; i < np_b; i++) ss_b += gsl_pow_2(mean_b[i] - mean); ss_b *= replicate * np_a; int df_b = np_b - 1; double ms_b = ss_b / df_b; double ss_interaction = 0; for (int i = 0; i < np_a; i++) for (int j = 0; j < np_b; j++) ss_interaction += gsl_pow_2(groupMean[i][j] - mean_a[i] - mean_b[j] + mean); ss_interaction *= replicate; int df_interaction = (np_a - 1) * (np_b - 1); double ms_interaction = ss_interaction / df_interaction; QString* partitionNames_a = new QString[np_a]; QString* partitionNames_b = new QString[np_b]; QMapIterator itr_a(catToNumber_a); while (itr_a.hasNext()) { itr_a.next(); partitionNames_a[itr_a.value()-1] = itr_a.key(); } QMapIterator itr_b(catToNumber_b); while (itr_b.hasNext()) { itr_b.next(); partitionNames_b[itr_b.value()-1] = itr_b.key(); } // printing table; // cell constructor structure; data, level, rowSpanCount, m_columnspanCount, isHeader; QList rowMajor; rowMajor.append(new Cell("", 0, true, "", 2, 1)); for (int i = 0; i < np_b; i++) rowMajor.append(new Cell(partitionNames_b[i], 0, true, "", 1, 2)); rowMajor.append(new Cell("Mean", 0, true, "", 2)); for (int i = 0; i < np_b; i++) { rowMajor.append(new Cell("Mean", 1, true)); rowMajor.append(new Cell("Replicate", 1, true)); } int level = 2; for (int i = 0; i < np_a; i++) { rowMajor.append(new Cell(partitionNames_a[i], level, true)); for (int j = 0; j < np_b; j++) { rowMajor.append(new Cell(round(groupMean[i][j]), level)); rowMajor.append(new Cell(replicates[i][j], level)); } rowMajor.append(new Cell(round(mean_a[i]), level)); level++; } rowMajor.append(new Cell("Mean", level, true)); for (int i = 0; i < np_b; i++) rowMajor.append(new Cell(round(mean_b[i]), level, false, "", 1, 2)); rowMajor.append(new Cell(round(mean), level)); m_statsTable = "

" + i18n("Contingency Table") + "

"; m_statsTable += getHtmlTable3(rowMajor); m_statsTable += "
"; m_statsTable += "

" + i18n("results table") + "

"; rowMajor.clear(); level = 0; rowMajor.append(new Cell("", level, true)); rowMajor.append(new Cell("SS", level, true)); rowMajor.append(new Cell("DF", level, true, "degree of freedom")); rowMajor.append(new Cell("MS", level, true)); level++; rowMajor.append(new Cell(m_columns[0]->name(), level, true)); rowMajor.append(new Cell(round(ss_a), level)); rowMajor.append(new Cell(df_a, level)); rowMajor.append(new Cell(round(ms_a), level)); level++; rowMajor.append(new Cell(m_columns[1]->name(), level, true)); rowMajor.append(new Cell(round(ss_b), level)); rowMajor.append(new Cell(df_b, level)); rowMajor.append(new Cell(round(ms_b), level)); level++; rowMajor.append(new Cell("Interaction", level, true)); rowMajor.append(new Cell(round(ss_interaction), level)); rowMajor.append(new Cell(df_interaction, level)); rowMajor.append(new Cell(round(ms_interaction), level)); level++; rowMajor.append(new Cell("Within", level, true)); rowMajor.append(new Cell(round(ss_within), level)); rowMajor.append(new Cell(df_within, level)); rowMajor.append(new Cell(round(ms_within), level)); m_statsTable += getHtmlTable3(rowMajor); double fValue_a = ms_a / ms_within; double fValue_b = ms_b / ms_within; double fValue_interaction = ms_interaction / ms_within; double m_pValue_a = nsl_stats_fdist_p(fValue_a, static_cast(np_a - 1), df_a); double m_pValue_b = nsl_stats_fdist_p(fValue_b, static_cast(np_b - 1), df_b); printLine(0, "F(df" + m_columns[0]->name() + ", dfwithin) is " + round(fValue_a), "blue"); printLine(1, "F(df" + m_columns[1]->name() + ", dfwithin) is " + round(fValue_b), "blue"); printLine(2, "F(dfinteraction, dfwithin) is " + round(fValue_interaction), "blue"); printLine(4, "P(df" + m_columns[0]->name() + ", dfwithin) is " + round(m_pValue_a), "blue"); printLine(5, "P(df" + m_columns[1]->name() + ", dfwithin) is " + round(m_pValue_b), "blue"); // printLine(2, "P(dfinteraction, dfwithin) is " + round(fValue_interaction), "blue"); m_statisticValue.append(fValue_a); m_statisticValue.append(fValue_b); m_statisticValue.append(fValue_interaction); m_pValue.append(m_pValue_a); m_pValue.append(m_pValue_b); delete[] mean_a; delete[] mean_b; delete[] partitionNames_a; delete[] partitionNames_b; return; } /**************************************Levene Test****************************************/ // Some reference to local variables. // np = number of partitions // df = degree of fredom // totalRows = total number of rows in column // these variables are taken from: https://en.wikipedia.org/wiki/Levene%27s_test // yiBar = mean of ith group; // Zij = |Yij - yiBar| // ziBar = mean of Zij for group i // ziBarBar = mean for all zij // ni = number of elements in group i void HypothesisTest::m_performLeveneTest(bool categoricalVariable) { if (m_columns.size() != 2) { printError("Inappropriate number of m_columns selected"); return; } int np = 0; int n = 0; if (!categoricalVariable && isNumericOrInteger(m_columns[0])) np = m_columns.size(); else countPartitions(m_columns[0], np, n); if (np < 2) { printError("Select at least two m_columns / classes"); return; } double* yiBar = new double[np]; double* ziBar = new double[np]; double ziBarBar = 0; double* ni = new double[np]; for (int i = 0; i < np; i++) { yiBar[i] = 0; ziBar[i] = 0; ni[i] = 0; } double fValue; int df = 0; int totalRows = 0; QString* colNames = new QString[np]; if (!categoricalVariable && isNumericOrInteger(m_columns[0])) { totalRows = m_columns[0]->rowCount(); double value = 0; for (int j = 0; j < totalRows; j++) { int numberNaNCols = 0; for (int i = 0; i < np; i++) { value = m_columns[i]->valueAt(j); if (std::isnan(value)) { numberNaNCols++; continue; } yiBar[i] += value; ni[i]++; n++; } if (numberNaNCols == np) { totalRows = j; break; } } for (int i = 0; i < np; i++) { if (ni[i] > 0) yiBar[i] = yiBar[i] / ni[i]; else { printError("One of the selected m_columns is empty
" "or have choosen Independent Var.1 wrongly"); return; } } for (int j = 0; j < totalRows; j++) { for (int i = 0; i < np; i++) { value = m_columns[i]->valueAt(j); if (!(std::isnan(value))) ziBar[i] += fabs(value - yiBar[i]); } } for (int i = 0; i < np; i++) { ziBarBar += ziBar[i]; if (ni[i] > 0) ziBar[i] = ziBar[i] / ni[i]; } ziBarBar = ziBarBar / n; double numberatorValue = 0; double denominatorValue = 0; for (int j = 0; j < totalRows; j++) { for (int i = 0; i < np; i++) { value = m_columns[i]->valueAt(j); if (!(std::isnan(value))) { double zij = fabs(value - yiBar[i]); denominatorValue += gsl_pow_2( (zij - ziBar[i])); } } } - if (gsl_fcmp(denominatorValue, 0. ,1.e-16)) { + if (denominatorValue == 0.0) { printError( i18n("Denominator value is %1", denominatorValue)); return; } for (int i = 0; i < np; i++) { colNames[i] = m_columns[i]->name(); numberatorValue += ni[i]*gsl_pow_2( (ziBar[i]-ziBarBar)); } fValue = ((n - np) / (np - 1)) * (numberatorValue / denominatorValue); } else { QMap classnameToIndex; AbstractColumn::ColumnMode originalColMode = m_columns[0]->columnMode(); m_columns[0]->setColumnMode(AbstractColumn::Text); int partitionNumber = 1; QString name; double value; int classIndex; for (int j = 0; j < n; j++) { name = m_columns[0]->textAt(j); value = m_columns[1]->valueAt(j); if (std::isnan(value)) { n = j; break; } if (classnameToIndex[name] == 0) { classnameToIndex[name] = partitionNumber; partitionNumber++; } classIndex = classnameToIndex[name]-1; ni[classIndex]++; yiBar[classIndex] += value; } for (int i = 0; i < np; i++) { if (ni[i] > 0) yiBar[i] = yiBar[i] / ni[i]; else { printError("One of the selected m_columns is empty
" "or have choosen Independent Var.1 wrongly"); m_columns[0]->setColumnMode(originalColMode); return; } } for (int j = 0; j < n; j++) { name = m_columns[0]->textAt(j); value = m_columns[1]->valueAt(j); classIndex = classnameToIndex[name] - 1; ziBar[classIndex] += fabs(value - yiBar[classIndex]); } for (int i = 0; i < np; i++) { ziBarBar += ziBar[i]; ziBar[i] = ziBar[i] / ni[i]; } ziBarBar = ziBarBar / n; double numberatorValue = 0; double denominatorValue = 0; for (int j = 0; j < n; j++) { name = m_columns[0]->textAt(j); value = m_columns[1]->valueAt(j); classIndex = classnameToIndex[name] - 1; double zij = fabs(value - yiBar[classIndex]); denominatorValue += gsl_pow_2( (zij - ziBar[classIndex])); } for (int i = 0; i < np; i++) numberatorValue += ni[i]*gsl_pow_2( (ziBar[i]-ziBarBar)); - if (gsl_fcmp(denominatorValue, 0., 1.e-16)) { + if (denominatorValue == 0.0) { printError( "number of data points is less or than equal to number of categorical variables"); m_columns[0]->setColumnMode(originalColMode); return; } fValue = ((n - np) / (np - 1)) * (numberatorValue / denominatorValue); QMapIterator i(classnameToIndex); while (i.hasNext()) { i.next(); colNames[i.value()-1] = m_columns[0]->name() + " " + i.key(); } m_columns[0]->setColumnMode(originalColMode); } df = n - np; // now making the stats table. int rowCount = np+1; int columnCount = 4; QVariant* rowMajor = new QVariant[rowCount*columnCount]; // header data; rowMajor[0] = ""; rowMajor[1] = "Ni"; rowMajor[2] = "yiBar"; rowMajor[3] = "ziBar"; // table data for (int row_i = 1; row_i < rowCount; row_i++) { rowMajor[row_i*columnCount] = colNames[row_i-1]; rowMajor[row_i*columnCount + 1] = ni[row_i-1]; rowMajor[row_i*columnCount + 2] = yiBar[row_i-1]; rowMajor[row_i*columnCount + 3] = ziBar[row_i-1]; } m_statsTable = getHtmlTable(rowCount, columnCount, rowMajor); delete[] rowMajor; delete[] yiBar; delete[] ziBar; delete[] ni; m_pValue.append(nsl_stats_fdist_p(fValue, static_cast(np-1), df)); printLine(0, "Null Hypothesis: Variance is equal between all classes", "blue"); printLine(1, "Alternate Hypothesis: Variance is not equal in at-least one pair of classes", "blue"); printLine(2, i18n("Significance level is %1", round(m_significanceLevel)), "blue"); printLine(4, i18n("F Value is %1 ", round(fValue)), "green"); printLine(5, i18n("P Value is %1 ", m_pValue[0]), "green"); printLine(6, i18n("Degree of Freedom is %1", df), "green"); if (m_pValue[0] <= m_significanceLevel) { printTooltip(5, i18n("We can safely reject Null Hypothesis for significance level %1", m_significanceLevel)); printLine(8, "Requirement for homogeneity is not met", "red"); } else { printTooltip(5, i18n("There is a plausibility for Null Hypothesis to be true")); printLine(8, "Requirement for homogeneity is met", "green"); } m_statisticValue.append(fValue); return; } //TODO change ("⋖") symbol to ("<"), currently macro UTF8_QSTRING is not working properly if used "<" symbol; // TODO: check for correctness between: for TestZ with TailTwo // m_pValue.append(2*gsl_cdf_tdist_P(value, df) v/s // m_pValue.append(gsl_cdf_tdis_P(value, df) + gsl_cdf_tdis_P(-value, df); double HypothesisTest::getPValue(const HypothesisTest::Test::Type& test, double& value, const QString& col1Name, const QString& col2Name, const double mean, const double sp, const int df) { switch (test) { case HypothesisTest::Test::Type::TTest: { switch (m_tailType) { case HypothesisTest::Test::Tail::Negative: { m_pValue.append(gsl_cdf_tdist_P(value, df)); printLine(0, i18n("Null Hypothesis: Population mean of %1 %2 Population mean of %3", col1Name, UTF8_QSTRING("≥"), col2Name), "blue"); printLine(1, i18n("Alternate Hypothesis: Population mean of %1 %2 Population mean of %3", col1Name, UTF8_QSTRING("⋖"), col2Name), "blue"); break; } case HypothesisTest::Test::Tail::Positive: { value *= -1; m_pValue.append(gsl_cdf_tdist_P(value, df)); printLine(0, i18n("Null Hypothesis: Population mean of %1 %2 Population mean of %3", col1Name, UTF8_QSTRING("≤"), col2Name), "blue"); printLine(1, i18n("Alternate Hypothesis: Population mean of %1 %2 Population mean of %3", col1Name, UTF8_QSTRING(">"), col2Name), "blue"); break; } case HypothesisTest::Test::Tail::Two: { m_pValue.append(2.*gsl_cdf_tdist_P(-fabs(value), df)); printLine(0, i18n("Null Hypothesis: Population mean of %1 %2 Population mean of %3", col1Name, UTF8_QSTRING("="), col2Name), "blue"); printLine(1, i18n("Alternate Hypothesis: Population mean of %1 %2 Population mean of %3", col1Name, UTF8_QSTRING("≠"), col2Name), "blue"); break; } } break; } case HypothesisTest::Test::Type::ZTest: { switch (m_tailType) { case HypothesisTest::Test::Tail::Negative: { m_pValue.append(gsl_cdf_gaussian_P(value - mean, sp)); printLine(0, i18n("Null Hypothesis: Population mean of %1 %2 Population mean of %3 ", col1Name, UTF8_QSTRING("≥"), col2Name), "blue"); printLine(1, i18n("Alternate Hypothesis: Population mean of %1 %2 Population mean of %3 ", col1Name, UTF8_QSTRING("⋖"), col2Name), "blue"); break; } case HypothesisTest::Test::Tail::Positive: { value *= -1; m_pValue.append(nsl_stats_tdist_p(value - mean, sp)); printLine(0, i18n("Null Hypothesis: Population mean of %1 %2 Population mean of %3 ", col1Name, UTF8_QSTRING("≤"), col2Name), "blue"); printLine(1, i18n("Alternate Hypothesis: Population mean of %1 %2 Population mean of %3 ", col1Name, UTF8_QSTRING(">"), col2Name), "blue"); break; } case HypothesisTest::Test::Tail::Two: { m_pValue.append(2.*gsl_cdf_gaussian_P(value - mean, sp)); printLine(0, i18n("Null Hypothesis: Population mean of %1 %2 Population mean of %3 ", col1Name, UTF8_QSTRING("="), col2Name), "blue"); printLine(1, i18n("Alternate Hypothesis: Population mean of %1 %2 Population mean of %3 ", col1Name, UTF8_QSTRING("≠"), col2Name), "blue"); break; } } break; } case HypothesisTest::Test::Type::Anova: case HypothesisTest::Test::Type::NoneType: break; } if (m_pValue[0] > 1) return 1; return m_pValue[0]; } // Virtual functions QWidget* HypothesisTest::view() const { if (!m_partView) { m_view = new HypothesisTestView(const_cast(this)); m_partView = m_view; } return m_partView; } diff --git a/src/backend/generalTest/HypothesisTest.h b/src/backend/generalTest/HypothesisTest.h index 2723f899e..d76f13412 100644 --- a/src/backend/generalTest/HypothesisTest.h +++ b/src/backend/generalTest/HypothesisTest.h @@ -1,91 +1,91 @@ /*************************************************************************** File : HypothesisTest.h Project : LabPlot Description : Doing Hypothesis-Test on data provided -------------------------------------------------------------------- Copyright : (C) 2019 Devanshu Agarwal(agarwaldevanshu8@gmail.com) ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 51 Franklin Street, Fifth Floor, * * Boston, MA 02110-1301 USA * * * ***************************************************************************/ #ifndef HYPOTHESISTEST_H #define HYPOTHESISTEST_H #include "GeneralTest.h" class HypothesisTest : public GeneralTest { Q_OBJECT public: - explicit HypothesisTest(const QString& name); + explicit HypothesisTest(const QString& name); ~HypothesisTest() override; struct Test { enum Type { NoneType = 0, TTest = 1 << 0, ZTest = 1 << 1, Anova = 1 << 2 }; enum SubType { NoneSubType = 0, TwoSampleIndependent = 1 << 0, TwoSamplePaired = 1 << 1, OneSample = 1 << 2, OneWay = 1 << 3, TwoWay = 1 << 4 }; enum Tail {Positive, Negative, Two}; Type type = NoneType; SubType subtype = NoneSubType; Tail tail; }; void setPopulationMean(QVariant populationMean); void setSignificanceLevel(QVariant alpha); void performTest(Test m_test, bool categoricalVariable = true, bool equalVariance = true); void performLeveneTest(bool categoricalVariable); QList& statisticValue(); QList& pValue(); QWidget* view() const override; private: void performTwoSampleIndependentTest(HypothesisTest::Test::Type test, bool categoricalVariable = false, bool equalVariance = true); void performTwoSamplePairedTest(HypothesisTest::Test::Type test); void performOneSampleTest(HypothesisTest::Test::Type test); void performOneWayAnova(); void performTwoWayAnova(); void m_performLeveneTest(bool categoricalVariable); double getPValue(const HypothesisTest::Test::Type& test, double& value, const QString& col1Name, const QString& col2name, const double mean, const double sp, const int df); double m_populationMean; double m_significanceLevel; HypothesisTest::Test::Tail m_tailType; QList m_pValue; QList m_statisticValue; }; #endif // HypothesisTest_H diff --git a/tests/stats/correlation_coefficient/CorrelationCoefficientTest.cpp b/tests/stats/correlation_coefficient/CorrelationCoefficientTest.cpp index 92d58f156..86c00c699 100644 --- a/tests/stats/correlation_coefficient/CorrelationCoefficientTest.cpp +++ b/tests/stats/correlation_coefficient/CorrelationCoefficientTest.cpp @@ -1,257 +1,259 @@ /*************************************************************************** File : CorrelationCoefficientTest.cpp Project : LabPlot Description : Unit Testing for Correlation Coefficient -------------------------------------------------------------------- Copyright : (C) 2019 Devanshu Agarwal (agarwaldevanshu8@gmail.com) ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 51 Franklin Street, Fifth Floor, * * Boston, MA 02110-1301 USA * * * ***************************************************************************/ #include "CorrelationCoefficientTest.h" #include "backend/generalTest/CorrelationCoefficient.h" #include "backend/core/AbstractColumn.h" #include "backend/core/column/Column.h" void CorrelationCoefficientTest::pearsonCoefficient_data() { QTest::addColumn>("col1Data"); QTest::addColumn>("col2Data"); QTest::addColumn("correlationValue_expected"); QTest::addColumn("zValue_expected"); // First Sample // This sample is taken from: http://learntech.uwe.ac.uk/da/Default.aspx?pageid=1442 QVector col1Data = {56, 56, 65, 65, 50, 25, 87, 44, 35}; QVector col2Data = {87, 91, 85, 91, 75, 28, 122, 66, 58}; double correlationValue_expected = 0.96619424909; double zValue_expected = 0.; QTest::newRow("Sample 1") << col1Data << col2Data << correlationValue_expected << zValue_expected; // Second Sample // This sample is taken from: // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/correlation-coefficient-formula/ col1Data = {43, 21, 25, 42, 57, 59}; col2Data = {99, 65, 79, 75, 87, 81}; correlationValue_expected = 0.52980897305; zValue_expected = 0.; QTest::newRow("Sample 2") << col1Data << col2Data << correlationValue_expected << zValue_expected; // Third Sample // This sample is taken from: // https://www.myaccountingcourse.com/financial-ratios/correlation-coefficient col1Data = {8, 8, 6, 5, 7, 6}; col2Data = {81, 80, 75, 65, 91, 80}; correlationValue_expected = 0.64755960039; zValue_expected = 0.; QTest::newRow("Sample 3") << col1Data << col2Data << correlationValue_expected << zValue_expected; } void CorrelationCoefficientTest::pearsonCoefficient() { QFETCH(QVector, col1Data); QFETCH(QVector, col2Data); QFETCH(double, correlationValue_expected); QFETCH(double, zValue_expected); Column* col1 = new Column("col1", AbstractColumn::Numeric); Column* col2 = new Column("col2", AbstractColumn::Numeric); col1->replaceValues(0, col1Data); col2->replaceValues(0, col2Data); QVector cols; cols << col1 << col2; CorrelationCoefficient correlationCoefficientTest("Pearson's R"); correlationCoefficientTest.setColumns(cols); CorrelationCoefficient::Test test; test = CorrelationCoefficient::Test::Pearson; bool categoricalVariable = false; correlationCoefficientTest.performTest(test, categoricalVariable); double correlationValue = correlationCoefficientTest.correlationValue(); double zValue = correlationCoefficientTest.statisticValue()[0]; QDEBUG("Correlation Value is " << correlationValue); QDEBUG("Correlation Value Expected is " << correlationValue_expected); QDEBUG("Z Value is: " << zValue); QDEBUG("Z Value Expected is: " << zValue_expected); FuzzyCompare(correlationValue, correlationValue_expected, 1.e-5); FuzzyCompare(zValue, zValue_expected); } void CorrelationCoefficientTest::kendallCoefficient_data() { QTest::addColumn>("col1Values"); QTest::addColumn>("col2Values"); QTest::addColumn>("col1Texts"); QTest::addColumn>("col2Texts"); QTest::addColumn("isDouble"); QTest::addColumn("correlationValue_expected"); QTest::addColumn("zValue_expected"); // First Sample // This sample is taken from: // https://www.statsdirect.com/help/nonparametric_methods/kendall_correlation.htm QVector col1Values = {4, 10, 3, 1, 9, 2, 6, 7, 8, 5}; QVector col2Values = {5, 8, 6, 2, 10, 3, 9, 4, 7, 1}; QVector col1Texts; QVector col2Texts; bool isDouble = true; double correlationValue_expected = 0.51111114025116; double zValue_expected = 2.05718265659; QTest::newRow("Sample 1") << col1Values << col2Values << col1Texts << col2Texts << isDouble << correlationValue_expected << zValue_expected; // Second Sample // This sample is taken from: // https://www.statisticshowto.datasciencecentral.com/kendalls-tau/ col1Texts = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"}; col2Texts = {"A", "B", "D", "C", "F", "E", "H", "G", "J", "I", "L", "K"}; col1Values = {}; col2Values = {}; + isDouble = false; correlationValue_expected = 0.84848484848; - zValue_expected = 3.84676339286; + zValue_expected = 3.84006269541 +; QTest::newRow("Sample 2") << col1Values << col2Values << col1Texts << col2Texts << isDouble << correlationValue_expected << zValue_expected; } void CorrelationCoefficientTest::kendallCoefficient() { QFETCH(QVector, col1Values); QFETCH(QVector, col2Values); QFETCH(QVector, col1Texts); QFETCH(QVector, col2Texts); QFETCH(bool, isDouble); QFETCH(double, correlationValue_expected); QFETCH(double, zValue_expected); Column* col1; Column* col2; if (isDouble){ col1 = new Column("col1", AbstractColumn::Numeric); col2 = new Column("col2", AbstractColumn::Numeric); col1->replaceValues(0, col1Values); col2->replaceValues(0, col2Values); } else { col1 = new Column("col1", AbstractColumn::Text); col2 = new Column("col2", AbstractColumn::Text); col1->replaceTexts(0, col1Texts); col2->replaceTexts(0, col2Texts); } QVector cols; cols << col1 << col2; CorrelationCoefficient correlationCoefficientTest("Kendall's Tau"); correlationCoefficientTest.setColumns(cols); CorrelationCoefficient::Test test; test = CorrelationCoefficient::Test::Kendall; bool categoricalVariable = false; correlationCoefficientTest.performTest(test, categoricalVariable); double correlationValue = correlationCoefficientTest.correlationValue(); double zValue = correlationCoefficientTest.statisticValue()[0]; QDEBUG("Correlation Value is " << correlationValue); QDEBUG("Correlation Value Expected is " << correlationValue_expected); QDEBUG("Z Value is: " << zValue); QDEBUG("Z Value Expected is: " << zValue_expected); FuzzyCompare(correlationValue, correlationValue_expected, 1.e-7); FuzzyCompare(zValue, zValue_expected, 1.e-7); } void CorrelationCoefficientTest::spearmanCoefficient_data() { QTest::addColumn>("col1Data"); QTest::addColumn>("col2Data"); QTest::addColumn("correlationValue_expected"); // First Sample // This sample is taken: // https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php QVector col1Data = {56, 75, 45, 71, 62, 64, 58, 80, 76, 61}; QVector col2Data = {66, 70, 40, 60, 65, 56, 59, 77, 67, 63}; double correlationValue_expected = 0.67272727272; QTest::newRow("Sample 1") << col1Data << col2Data << correlationValue_expected; // Second Sample // This sample is taken from: // https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient col1Data = {106, 86, 100, 101, 99, 103, 97, 113, 112, 110}; col2Data = {7, 0, 27, 50, 28, 29, 20, 12, 6, 17}; correlationValue_expected = -0.17575757575; QTest::newRow("Sample 2") << col1Data << col2Data << correlationValue_expected; } void CorrelationCoefficientTest::spearmanCoefficient() { QFETCH(QVector, col1Data); QFETCH(QVector, col2Data); QFETCH(double, correlationValue_expected); Column* col1 = new Column("col1", AbstractColumn::Numeric); Column* col2 = new Column("col2", AbstractColumn::Numeric); col1->replaceValues(0, col1Data); col2->replaceValues(0, col2Data); QVector cols; cols << col1 << col2; CorrelationCoefficient correlationCoefficientTest("Spearman Rank"); correlationCoefficientTest.setColumns(cols); CorrelationCoefficient::Test test; test = CorrelationCoefficient::Test::Spearman; bool categoricalVariable = false; correlationCoefficientTest.performTest(test, categoricalVariable); double correlationValue = correlationCoefficientTest.correlationValue(); QDEBUG("Correlation Value is " << correlationValue); QDEBUG("Correlation Value Expected is " << correlationValue_expected); FuzzyCompare(correlationValue, correlationValue_expected, 1.e-5); } QTEST_MAIN(CorrelationCoefficientTest) diff --git a/tests/stats/ttest/TTestTest.cpp b/tests/stats/ttest/TTestTest.cpp index ae86edf1c..d665ce45f 100644 --- a/tests/stats/ttest/TTestTest.cpp +++ b/tests/stats/ttest/TTestTest.cpp @@ -1,200 +1,202 @@ /*************************************************************************** File : CorrelationTest.cpp Project : LabPlot Description : Tests for data correlation -------------------------------------------------------------------- Copyright : (C) 2019 Devanshu Agarwal (agarwaldevanshu8@gmail.com) ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 51 Franklin Street, Fifth Floor, * * Boston, MA 02110-1301 USA * * * ***************************************************************************/ #include "TTestTest.h" #include "backend/generalTest/HypothesisTest.h" #include "backend/core/AbstractColumn.h" #include "backend/core/column/Column.h" +//TODO: Decrease relative errors and increase more floating points for expected values. + void TTestTest::twoSampleIndependent_data() { QTest::addColumn>("col1Data"); QTest::addColumn>("col2Data"); QTest::addColumn("tValue_expected"); QTest::addColumn("pValue_expected"); // First Sample // This data set is taken from "JASP" QVector col1Data = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; QVector col2Data = {3, 1, 5, 4, 6, 4, 6, 2, 0, 5, 4, 5, 4, 3, 6, 6, 8, 5, 5, 4, 2, 5, 7, 5}; double tValue_expected = -1.713; double pValue_expected = 0.101; QTest::newRow("invisible cloak") << col1Data << col2Data << tValue_expected << pValue_expected; // Second Sample // This data set is taken from "JASP" col1Data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; col2Data = {42, 46, 43, 10, 55, 17, 26, 60, 62, 53, 37, 42, 33, 37, 41, 42, 19, 55, 54, 28, 20, 48, 85, 24, 56, 43, 59, 58, 52, 71, 62, 43, 54, 49, 57, 61, 33, 44, 46, 67, 43, 49, 57, 53}; tValue_expected = -2.26; pValue_expected = 0.028; QTest::newRow("directed control activities") << col1Data << col2Data << tValue_expected << pValue_expected; } void TTestTest::twoSampleIndependent() { QFETCH(QVector, col1Data); QFETCH(QVector, col2Data); QFETCH(double, tValue_expected); QFETCH(double, pValue_expected); Column* col1 = new Column("col1", AbstractColumn::Numeric); Column* col2 = new Column("col2", AbstractColumn::Numeric); col1->replaceValues(0, col1Data); col2->replaceValues(0, col2Data); QVector cols; cols << col1 << col2; HypothesisTest tTest("Two Sample Independent"); tTest.setColumns(cols); HypothesisTest::Test test; test.type = HypothesisTest::Test::Type::TTest; test.subtype = HypothesisTest::Test::SubType::TwoSampleIndependent; test.tail = HypothesisTest::Test::Tail::Two; bool categoricalVariable = true; bool equalVariance = true; tTest.performTest(test, categoricalVariable, equalVariance); double tValue = tTest.statisticValue()[0]; double pValue = tTest.pValue()[0]; qDebug() << "tValue is " << tValue; qDebug() << "pValue is: " << pValue; qDebug() << "tValue_expected is " << tValue_expected; qDebug() << "pValue_expected is: " << pValue_expected; FuzzyCompare(tValue, tValue_expected, (0.01) / abs(tValue)); FuzzyCompare(pValue, pValue_expected, (0.01) / abs(pValue)); } void TTestTest::twoSamplePaired_data() { QTest::addColumn>("col1Data"); QTest::addColumn>("col2Data"); QTest::addColumn("tValue_expected"); QTest::addColumn("pValue_expected"); // First Sample // This data set is taken from "JASP" // DATA SET:: Moon and Aggression QVector col1Data = {3.33, 3.67, 2.67, 3.33, 3.33, 3.67, 4.67, 2.67, 6, 4.33, 3.33, 0.67, 1.33, 0.33, 2}; QVector col2Data = {0.27, 0.59, 0.32, 0.19, 1.26, 0.11, 0.3, 0.4, 1.59, 0.6, 0.65, 0.69, 1.26, 0.23, 0.38}; double tValue_expected = 6.452; double pValue_expected = 0.001; QTest::newRow("Moon and Aggression") << col1Data << col2Data << tValue_expected << pValue_expected; } void TTestTest::twoSamplePaired() { QFETCH(QVector, col1Data); QFETCH(QVector, col2Data); QFETCH(double, tValue_expected); QFETCH(double, pValue_expected); Column* col1 = new Column("col1", AbstractColumn::Numeric); Column* col2 = new Column("col2", AbstractColumn::Numeric); col1->replaceValues(0, col1Data); col2->replaceValues(0, col2Data); QVector cols; cols << col1 << col2; HypothesisTest tTest("Two Sample Paried"); tTest.setColumns(cols); HypothesisTest::Test test; test.type = HypothesisTest::Test::Type::TTest; test.subtype = HypothesisTest::Test::SubType::TwoSamplePaired; test.tail = HypothesisTest::Test::Tail::Two; tTest.performTest(test); double tValue = tTest.statisticValue()[0]; double pValue = tTest.pValue()[0]; qDebug() << "tValue is " << tValue; qDebug() << "pValue is: " << pValue; qDebug() << "tValue_expected is " << tValue_expected; qDebug() << "pValue_expected is: " << pValue_expected; FuzzyCompare(tValue, tValue_expected, (0.01) / abs(tValue)); FuzzyCompare(pValue, pValue_expected, (0.01) / abs(pValue)); } void TTestTest::oneSample_data() { QTest::addColumn>("col1Data"); QTest::addColumn("populationMean"); QTest::addColumn("tValue_expected"); QTest::addColumn("pValue_expected"); // First Sample // This data set is taken from "JASP" // DATA SET:: Weight Gain; QVector col1Data = {13.2, 8.58, 14.08, 8.58, 10.56, 14.74, 7.92, 13.2, 12.76, 5.72, 11.66, 7.04, 3.08, 15.62, 14.3, 5.5}; double populationMean = 16; double tValue_expected = -5.823; double pValue_expected = 0.001; QTest::newRow("weight gain") << col1Data << populationMean << tValue_expected << pValue_expected; } void TTestTest::oneSample() { QFETCH(QVector, col1Data); QFETCH(double, populationMean); QFETCH(double, tValue_expected); QFETCH(double, pValue_expected); Column* col1 = new Column("col1", AbstractColumn::Numeric); col1->replaceValues(0, col1Data); QVector cols; cols << col1; HypothesisTest tTest("One Sample"); tTest.setColumns(cols); tTest.setPopulationMean(populationMean); HypothesisTest::Test test; test.type = HypothesisTest::Test::Type::TTest; test.subtype = HypothesisTest::Test::SubType::OneSample; test.tail = HypothesisTest::Test::Tail::Two; tTest.performTest(test); double tValue = tTest.statisticValue()[0]; double pValue = tTest.pValue()[0]; qDebug() << "tValue is " << tValue; qDebug() << "pValue is: " << pValue; qDebug() << "tValue_expected is " << tValue_expected; qDebug() << "pValue_expected is: " << pValue_expected; FuzzyCompare(tValue, tValue_expected, (0.01) / fabs(tValue)); FuzzyCompare(pValue, pValue_expected, (0.01) / fabs(pValue)); } QTEST_MAIN(TTestTest)