diff --git a/src/backend/generalTest/CorrelationCoefficient.cpp b/src/backend/generalTest/CorrelationCoefficient.cpp index 4a20ba843..f1f609936 100644 --- a/src/backend/generalTest/CorrelationCoefficient.cpp +++ b/src/backend/generalTest/CorrelationCoefficient.cpp @@ -1,423 +1,424 @@ /*************************************************************************** File : CorrelationCoefficient.cpp Project : LabPlot Description : Finding Correlation Coefficient on data provided -------------------------------------------------------------------- Copyright : (C) 2019 Devanshu Agarwal(agarwaldevanshu8@gmail.com) ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 51 Franklin Street, Fifth Floor, * * Boston, MA 02110-1301 USA * * * ***************************************************************************/ #include "CorrelationCoefficient.h" #include "GeneralTest.h" #include "kdefrontend/generalTest/CorrelationCoefficientView.h" #include "backend/spreadsheet/Spreadsheet.h" #include "backend/core/column/Column.h" #include "backend/lib/macros.h" #include #include #include #include #include #include #include #include #include #include #include #include #include extern "C" { #include "backend/nsl/nsl_stats.h" } CorrelationCoefficient::CorrelationCoefficient(const QString& name) : GeneralTest(name, AspectType::CorrelationCoefficient) { } CorrelationCoefficient::~CorrelationCoefficient() { } void CorrelationCoefficient::performTest(int test, bool categoricalVariable, bool calculateStats) { m_statsTable = ""; m_correlationValue = 0; m_statisticValue.clear(); m_pValue.clear(); m_inputStatsTableModel->clear(); for (int i = 0; i < RESULTLINESCOUNT; i++) m_resultLine[i]->clear(); switch (testType(test)) { case CorrelationCoefficient::Pearson: { m_currTestName = "

" + i18n("Pearson's r Correlation Test") + "

"; performPearson(categoricalVariable); break; } case CorrelationCoefficient::Kendall: m_currTestName = "

" + i18n("Kendall's Rank Correlation Test") + "

"; performKendall(); break; case CorrelationCoefficient::Spearman: { m_currTestName = "

" + i18n("Spearman Correlation Coefficient Test") + "

"; performSpearman(); break; } case CorrelationCoefficient::ChiSquare: switch (testSubtype(test)) { case CorrelationCoefficient::IndependenceTest: performChiSquareIndpendence(calculateStats); break; } break; } emit changed(); } double CorrelationCoefficient::correlationValue() const { return m_correlationValue; } QList CorrelationCoefficient::statisticValue() const { return m_statisticValue; } QList CorrelationCoefficient::pValue() const { return m_pValue; } /*************************************************************************************************************************** * Private Implementations * ************************************************************************************************************************/ /*********************************************Pearson r ******************************************************************/ //Formulaes are taken from https://www.statisticssolutions.com/correlation-pearson-kendall-spearman/ // variables: // N = total number of observations // sumColx = sum of values in colx // sumSqColx = sum of square of values in colx // sumColxColy = sum of product of values in colx and coly //TODO: support for col1 is categorical. //TODO: add tooltip for correlation value result //TODO: find p value void CorrelationCoefficient::performPearson(bool categoricalVariable) { if (m_columns.count() != 2) { printError("Select only 2 columns "); return; } if (categoricalVariable) { printLine(1, "currently categorical variable not supported", "blue"); return; } QString col1Name = m_columns[0]->name(); QString col2Name = m_columns[1]->name(); if (!m_columns[1]->isNumeric()) printError("Column " + col2Name + " should contain only numeric or interger values"); int N = findCount(m_columns[0]); if (N != findCount(m_columns[1])) { printError("Number of data values in Column: " + col1Name + "and Column: " + col2Name + "are not equal"); return; } double sumCol1 = findSum(m_columns[0], N); double sumCol2 = findSum(m_columns[1], N); double sumSqCol1 = findSumSq(m_columns[0], N); double sumSqCol2 = findSumSq(m_columns[1], N); double sumCol12 = 0; for (int i = 0; i < N; i++) sumCol12 += m_columns[0]->valueAt(i) * m_columns[1]->valueAt(i); // printing table; // HtmlCell constructor structure; data, level, rowSpanCount, m_columnspanCount, isHeader; QList rowMajor; int level = 0; // horizontal header QString sigma = UTF8_QSTRING("Σ"); rowMajor.append(new HtmlCell("", level, true)); rowMajor.append(new HtmlCell("N", level, true, "Total Number of Observations")); rowMajor.append(new HtmlCell(QString(sigma + "Scores"), level, true, "Sum of Scores in each column")); rowMajor.append(new HtmlCell(QString(sigma + "Scores2"), level, true, "Sum of Squares of scores in each column")); rowMajor.append(new HtmlCell(QString(sigma + "(" + UTF8_QSTRING("∏") + "Scores)"), level, true, "Sum of product of scores of both columns")); //data with vertical header. level++; rowMajor.append(new HtmlCell(col1Name, level, true)); rowMajor.append(new HtmlCell(N, level)); rowMajor.append(new HtmlCell(sumCol1, level)); rowMajor.append(new HtmlCell(sumSqCol1, level)); rowMajor.append(new HtmlCell(sumCol12, level, false, "", 2, 1)); level++; rowMajor.append(new HtmlCell(col2Name, level, true)); rowMajor.append(new HtmlCell(N, level)); rowMajor.append(new HtmlCell(sumCol2, level)); rowMajor.append(new HtmlCell(sumSqCol2, level)); m_statsTable += getHtmlTable3(rowMajor); m_correlationValue = (N * sumCol12 - sumCol1*sumCol2) / sqrt((N * sumSqCol1 - gsl_pow_2(sumCol1)) * (N * sumSqCol2 - gsl_pow_2(sumCol2))); printLine(0, QString("Correlation Value is %1").arg(round(m_correlationValue)), "green"); } /***********************************************Kendall ******************************************************************/ // used knight algorithm for fast performance O(nlogn) rather than O(n^2) // http://adereth.github.io/blog/2013/10/30/efficiently-computing-kendalls-tau/ // TODO: Change date format type to original for numeric type; // TODO: add tooltips. // TODO: Compute tauB for ties. // TODO: find P Value from Z Value void CorrelationCoefficient::performKendall() { if (m_columns.count() != 2) { printError("Select only 2 columns "); return; } QString col1Name = m_columns[0]->name(); QString col2Name = m_columns[1]->name(); int N = findCount(m_columns[0]); if (N != findCount(m_columns[1])) { printError("Number of data values in Column: " + col1Name + "and Column: " + col2Name + "are not equal"); return; } QVector col2Ranks(N); if (m_columns[0]->isNumeric()) { if (m_columns[0]->isNumeric() && m_columns[1]->isNumeric()) { for (int i = 0; i < N; i++) col2Ranks[int(m_columns[0]->valueAt(i)) - 1] = int(m_columns[1]->valueAt(i)); } else { printError(QString("Ranking System should be same for both Column: %1 and Column: %2
" "Hint: Check for data types of columns").arg(col1Name).arg(col2Name)); return; } } else { AbstractColumn::ColumnMode origCol1Mode = m_columns[0]->columnMode(); AbstractColumn::ColumnMode origCol2Mode = m_columns[1]->columnMode(); m_columns[0]->setColumnMode(AbstractColumn::Text); m_columns[1]->setColumnMode(AbstractColumn::Text); QMap ValueToRank; for (int i = 0; i < N; i++) { if (ValueToRank[m_columns[0]->textAt(i)] != 0) { printError("Currently ties are not supported"); m_columns[0]->setColumnMode(origCol1Mode); m_columns[1]->setColumnMode(origCol2Mode); return; } ValueToRank[m_columns[0]->textAt(i)] = i + 1; } for (int i = 0; i < N; i++) col2Ranks[i] = ValueToRank[m_columns[1]->textAt(i)]; m_columns[0]->setColumnMode(origCol1Mode); m_columns[1]->setColumnMode(origCol2Mode); } int nPossiblePairs = (N * (N - 1)) / 2; int nDiscordant = findDiscordants(col2Ranks.data(), 0, N - 1); int nCorcordant = nPossiblePairs - nDiscordant; m_correlationValue = double(nCorcordant - nDiscordant) / nPossiblePairs; m_statisticValue.append((3 * (nCorcordant - nDiscordant)) / sqrt(N * (N- 1) * (2 * N + 5) / 2)); printLine(0, QString("Number of Discordants are %1").arg(nDiscordant), "green"); printLine(1, QString("Number of Concordant are %1").arg(nCorcordant), "green"); printLine(2, QString("Tau a is %1").arg(round(m_correlationValue)), "green"); printLine(3, QString("Z Value is %1").arg(round(m_statisticValue[0])), "green"); return; } /***********************************************Spearman ******************************************************************/ // All formulaes and symbols are taken from : https://www.statisticshowto.datasciencecentral.com/spearman-rank-correlation-definition-calculate/ void CorrelationCoefficient::performSpearman() { if (m_columns.count() != 2) { printError("Select only 2 columns "); return; } QString col1Name = m_columns[0]->name(); QString col2Name = m_columns[1]->name(); int N = findCount(m_columns[0]); if (N != findCount(m_columns[1])) { printError("Number of data values in Column: " + col1Name + "and Column: " + col2Name + "are not equal"); return; } QMap col1Ranks; convertToRanks(m_columns[0], N, col1Ranks); QMap col2Ranks; convertToRanks(m_columns[1], N, col2Ranks); double ranksCol1Mean = 0; double ranksCol2Mean = 0; for (int i = 0; i < N; i++) { ranksCol1Mean += col1Ranks[int(m_columns[0]->valueAt(i))]; ranksCol2Mean += col2Ranks[int(m_columns[1]->valueAt(i))]; } ranksCol1Mean /= N; ranksCol2Mean /= N; double s12 = 0; double s1 = 0; double s2 = 0; for (int i = 0; i < N; i++) { double centeredRank_1 = col1Ranks[int(m_columns[0]->valueAt(i))] - ranksCol1Mean; double centeredRank_2 = col2Ranks[int(m_columns[1]->valueAt(i))] - ranksCol2Mean; s12 += centeredRank_1 * centeredRank_2; s1 += gsl_pow_2(centeredRank_1); s2 += gsl_pow_2(centeredRank_2); } s12 /= N; s1 /= N; s2 /= N; m_correlationValue = s12 / std::sqrt(s1 * s2); printLine(0, QString("Spearman Rank Correlation value is %1").arg(m_correlationValue), "green"); } /***********************************************Chi Square Test for Indpendence******************************************************************/ +// TODO: Implement this function void CorrelationCoefficient::performChiSquareIndpendence(bool calculateStats) { Q_UNUSED(calculateStats); } /***********************************************Helper Functions******************************************************************/ int CorrelationCoefficient::findDiscordants(int *ranks, int start, int end) { if (start >= end) return 0; int mid = (start + end) / 2; int leftDiscordants = findDiscordants(ranks, start, mid); int rightDiscordants = findDiscordants(ranks, mid + 1, end); int len = end - start + 1; int leftLen = mid - start + 1; int rightLen = end - mid; int leftLenRemain = leftLen; QVector leftRanks(leftLen); QVector rightRanks(rightLen); for (int i = 0; i < leftLen; i++) leftRanks[i] = ranks[start + i]; for (int i = leftLen; i < leftLen + rightLen; i++) rightRanks[i - leftLen] = ranks[start + i]; int mergeDiscordants = 0; int i = 0, j = 0, k =0; while (i < len) { if (j >= leftLen) { ranks[start + i] = rightRanks[k]; k++; } else if (k >= rightLen) { ranks[start + i] = leftRanks[j]; j++; } else if (leftRanks[j] < rightRanks[k]) { ranks[start + i] = leftRanks[j]; j++; leftLenRemain--; } else if (leftRanks[j] > rightRanks[k]) { ranks[start + i] = rightRanks[k]; mergeDiscordants += leftLenRemain; k++; } i++; } return leftDiscordants + rightDiscordants + mergeDiscordants; } void CorrelationCoefficient::convertToRanks(const Column* col, int N, QMap &ranks) { if (col->isNumeric()) return; double* sortedList = new double[N]; for (int i = 0; i < N; i++) sortedList[i] = col->valueAt(i); std::sort(sortedList, sortedList + N, std::greater()); ranks.clear(); for (int i = 0; i < N; i++) ranks[sortedList[i]] = i + 1; delete[] sortedList; } void CorrelationCoefficient::convertToRanks(const Column* col, QMap &ranks) { convertToRanks(col, findCount(col), ranks); } /***********************************************Virtual Functions******************************************************************/ QWidget* CorrelationCoefficient::view() const { if (!m_partView) { m_view = new CorrelationCoefficientView(const_cast(this)); m_partView = m_view; } return m_partView; } diff --git a/src/backend/generalTest/HypothesisTest.cpp b/src/backend/generalTest/HypothesisTest.cpp index 872afbf67..9f858c094 100644 --- a/src/backend/generalTest/HypothesisTest.cpp +++ b/src/backend/generalTest/HypothesisTest.cpp @@ -1,1167 +1,1159 @@ /*************************************************************************** File : HypothesisTest.cpp Project : LabPlot Description : Doing Hypothesis-Test on data provided -------------------------------------------------------------------- Copyright : (C) 2019 Devanshu Agarwal(agarwaldevanshu8@gmail.com) ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the Free Software * * Foundation, Inc., 51 Franklin Street, Fifth Floor, * * Boston, MA 02110-1301 USA * * * ***************************************************************************/ #include "HypothesisTest.h" #include "kdefrontend/generalTest/HypothesisTestView.h" #include "backend/spreadsheet/Spreadsheet.h" #include "backend/core/column/Column.h" #include "backend/lib/macros.h" #include #include #include #include #include #include #include #include extern "C" { #include "backend/nsl/nsl_stats.h" } HypothesisTest::HypothesisTest(const QString &name) : GeneralTest(name, AspectType::HypothesisTest) { } HypothesisTest::~HypothesisTest() { } void HypothesisTest::setPopulationMean(QVariant populationMean) { m_populationMean = populationMean.toDouble(); } void HypothesisTest::setSignificanceLevel(QVariant alpha) { m_significanceLevel = alpha.toDouble(); } void HypothesisTest::setTail(HypothesisTailType tail) { m_tail = tail; } void HypothesisTest::performTest(int test, bool categoricalVariable, bool equalVariance, bool calculateStats) { m_pValue.clear(); m_statisticValue.clear(); m_statsTable = ""; for (int i = 0; i < RESULTLINESCOUNT; i++) { m_resultLine[i]->clear(); m_resultLine[i]->setToolTip(QString()); } if (calculateStats) m_inputStatsTableModel->clear(); switch (testSubtype(test)) { case TwoSampleIndependent: { m_currTestName = "

" + i18n("Two Sample Independent Test") + "

"; performTwoSampleIndependentTest(testType(test), categoricalVariable, equalVariance, calculateStats); break; } case TwoSamplePaired: m_currTestName = "

" + i18n("Two Sample Paired Test") + "

"; performTwoSamplePairedTest(testType(test)); break; case OneSample: { m_currTestName = "

" + i18n("One Sample Test") + "

"; performOneSampleTest(testType(test)); break; } case OneWay: { m_currTestName = "

" + i18n("One Way Anova") + "

"; performOneWayAnova(); break; } case TwoWay: { m_currTestName = "

" + i18n("Two Way Anova") + "

"; performTwoWayAnova(); break; } } emit changed(); } void HypothesisTest::performLeveneTest(bool categoricalVariable) { m_pValue.clear(); m_statisticValue.clear(); m_statsTable = ""; for (int i = 0; i < RESULTLINESCOUNT; i++) { m_resultLine[i]->clear(); m_resultLine[i]->setToolTip(QString()); } m_currTestName = "

" + i18n("Levene Test for Equality of Variance") + "

"; m_performLeveneTest(categoricalVariable); emit changed(); } void HypothesisTest::initInputStatsTable(int test, bool calculateStats) { m_inputStatsTableModel->clear(); if (!calculateStats) { - switch (testSubtype(test)) { - case TwoSampleIndependent: { + if (testSubtype(test) == TwoSampleIndependent) { m_inputStatsTableModel->setRowCount(2); m_inputStatsTableModel->setColumnCount(4); m_inputStatsTableModel->setHorizontalHeaderLabels( {i18n("N"), i18n("Sum"), i18n("Mean"), i18n("Standard Deviation")}); - break; - } - case TwoSamplePaired: - case OneSample: - case OneWay: - case TwoWay: - break; } } emit changed(); } QList& HypothesisTest::statisticValue() { return m_statisticValue; } QList& HypothesisTest::pValue() { return m_pValue; } /****************************************************************************** * Private Implementations * ****************************************************************************/ //TODO: backend of z test; //TODO: use https://www.gnu.org/software/gsl/doc/html/statistics.html for basic statistic calculations /**************************Two Sample Independent *************************************/ // program is crashing on some input. void HypothesisTest::performTwoSampleIndependentTest(int test, bool categoricalVariable, bool equalVariance, bool calculateStats) { int n[2]; double sum[2], mean[2], std[2]; QString col1Name; QString col2Name; if (!calculateStats) { QString textValue; QDEBUG("m_inputStatsTable row and column count " << m_inputStatsTableModel->rowCount() << m_inputStatsTableModel->columnCount()); for (int i = 0; i < 2; i++) { n[i] = m_inputStatsTableModel->data(m_inputStatsTableModel->index(i, 0)).toInt(); sum[i] = m_inputStatsTableModel->data(m_inputStatsTableModel->index(i, 1)).toDouble(); mean[i] = m_inputStatsTableModel->data(m_inputStatsTableModel->index(i, 2)).toDouble(); std[i] = m_inputStatsTableModel->data(m_inputStatsTableModel->index(i, 3)).toDouble(); if (sum[i] == 0.0) sum[i] = mean[i] * n[i]; if (mean[i] == 0.0 && n[i] > 0) mean[i] = sum[i] / n[i]; col1Name = "1"; col2Name = "2"; } } else { if (m_columns.size() != 2) { printError("Inappropriate number of m_columns selected"); return; } col1Name = m_columns[0]->name(); col2Name = m_columns[1]->name(); if (!categoricalVariable && m_columns[0]->isNumeric()) { for (int i = 0; i < 2; i++) { findStats(m_columns[i], n[i], sum[i], mean[i], std[i]); if (n[i] == 0) { printError("At least two values should be there in every column"); return; } if (std[i] == 0.0) { printError(i18n("Standard Deviation of at least one column is equal to 0: last column is: %1", m_columns[i]->name())); return; } } } else { QMap colName; QString baseColName; int np; int totalRows; countPartitions(m_columns[0], np, totalRows); if (np != 2) { printError( i18n("Number of Categorical Variable in Column %1 is not equal to 2", m_columns[0]->name())); return; } if (m_columns[0]->isNumeric()) baseColName = m_columns[0]->name(); GeneralErrorType errorCode = findStatsCategorical(m_columns[0], m_columns[1], n, sum, mean, std, colName, np, totalRows); switch (errorCode) { case ErrorUnqualSize: { printError( i18n("Unequal size between Column %1 and Column %2", m_columns[0]->name(), m_columns[1]->name())); return; } case ErrorEmptyColumn: { printError("At least one of selected column is empty"); return; } case NoError: break; } QMapIterator iter(colName); while (iter.hasNext()) { iter.next(); if (iter.value() == 1) col1Name = baseColName + " " + iter.key(); else col2Name = baseColName + " " + iter.key(); } } } QVariant rowMajor[] = {"", "N", "Sum", "Mean", "Std", col1Name, n[0], sum[0], mean[0], std[0], col2Name, n[1], sum[1], mean[1], std[1] }; m_statsTable = getHtmlTable(3, 5, rowMajor); for (int i = 0; i < 2; i++) { if (n[i] == 0) { if (calculateStats) printError("At least two values should be there in every column"); else printError("In Statistic Table you filled above, value of N for every row should be > 0"); return; } if (std[i] == 0.0) { printError( i18n("Standard Deviation of at least one column is equal to 0: last column is: %1", m_columns[i]->name())); return; } } double stdSq[2] = {gsl_pow_2(std[0]), gsl_pow_2(std[1])}; QString testName; int df = 0; double spSq = 0; switch (testType(test)) { case TTest: { testName = "T"; if (equalVariance) { df = n[0] + n[1] - 2; spSq = ((n[0]-1) * stdSq[0] + (n[1]-1) * stdSq[1] ) / df; // QDEBUG("equal variance : spSq is " << spSq); m_statisticValue.append((mean[0] - mean[1]) / sqrt(spSq / n[0] + spSq / n[1])); printLine(9, "Assumption: Equal Variance b/w both population means"); } else { double temp_val; temp_val = gsl_pow_2( gsl_pow_2(std[0]) / n[0] + gsl_pow_2(std[1]) / n[1]); temp_val = temp_val / ( (gsl_pow_2( (gsl_pow_2(std[0]) / n[0]) ) / (n[0]-1)) + (gsl_pow_2( (gsl_pow_2(std[1]) / n[1]) ) / (n[1]-1))); df = qRound(temp_val); m_statisticValue.append((mean[0] - mean[1]) / (sqrt( (gsl_pow_2(std[0])/n[0]) + (gsl_pow_2(std[1])/n[1])))); printLine(9, "Assumption: UnEqual Variance b/w both population means"); } printLine(6, i18n("Degree of Freedom is %1", df), "green"); printTooltip(6, i18n("Number of independent Pieces of information that went into calculating the estimate")); printLine(8, "Assumption: Both Populations approximately follow normal distribution"); break; } case ZTest: { testName = "Z"; spSq = stdSq[0] / n[0] + stdSq[1] / n[1]; double zValue = (mean[0] - mean[1]) / sqrt(spSq); m_statisticValue.append(zValue); break; } } m_currTestName = "

" + i18n("Two Sample Independent %1 Test for %2 vs %3", testName, col1Name, col2Name) + "

"; m_pValue.append(getPValue(test, m_statisticValue[0], col1Name, col2Name, df)); printLine(2, i18n("Significance level is %1", round(m_significanceLevel)), "blue"); printLine(4, i18n("%1 Value is %2 ", testName, round(m_statisticValue[0])), "green"); printTooltip(4, i18n("More is the |%1-value|, more safely we can reject the null hypothesis", testName)); printLine(5, i18n("P Value is %1 ", m_pValue[0]), "green"); printTooltip(5, getPValueTooltip(m_pValue[0])); return; } /********************************Two Sample Paired ***************************************/ void HypothesisTest::performTwoSamplePairedTest(int test) { if (m_columns.size() != 2) { printError("Inappropriate number of m_columns selected"); return; } for (int i = 0; i < 2; i++) { if ( !m_columns[0]->isNumeric()) { printError("select only m_columns with numbers"); return; } } int n; double sum, mean, std; GeneralErrorType errorCode = findStatsPaired(m_columns[0], m_columns[1], n, sum, mean, std); switch (errorCode) { case ErrorUnqualSize: { printError("both m_columns are having different sizes"); return; } case ErrorEmptyColumn: { printError("m_columns are empty"); return; } case NoError: break; } QVariant rowMajor[] = {"", "N", "Sum", "Mean", "Std", "difference", n, sum, mean, std }; m_statsTable = getHtmlTable(2, 5, rowMajor); if (std == 0.0) { printError("Standard deviation of the difference is 0"); return; } QString testName; int df = 0; switch (test) { case TTest: { m_statisticValue[0] = mean / (std / sqrt(n)); df = n - 1; testName = "T"; printLine(6, i18n("Degree of Freedom is %1name(), i18n("%1", m_populationMean), df)); m_currTestName = "

" + i18n("One Sample %1 Test for %2 vs %3", testName, m_columns[0]->name(), m_columns[1]->name()) + "

"; printLine(2, i18n("Significance level is %1 ", round(m_significanceLevel)), "blue"); printLine(4, i18n("%1 Value is %2 ", testName, round(m_statisticValue[0])), "green"); printLine(5, i18n("P Value is %1 ", m_pValue[0]), "green"); printTooltip(5, getPValueTooltip(m_pValue[0])); return; } /******************************** One Sample ***************************************/ void HypothesisTest::performOneSampleTest(int test) { if (m_columns.size() != 1) { printError("Inappropriate number of m_columns selected"); return; } if ( !m_columns[0]->isNumeric()) { printError("select only m_columns with numbers"); return; } int n; double sum, mean, std; GeneralErrorType errorCode = findStats(m_columns[0], n, sum, mean, std); switch (errorCode) { case ErrorEmptyColumn: { printError("column is empty"); return; } case NoError: break; case ErrorUnqualSize: { return; } } QVariant rowMajor[] = {"", "N", "Sum", "Mean", "Std", m_columns[0]->name(), n, sum, mean, std }; m_statsTable = getHtmlTable(2, 5, rowMajor); if (std == 0.0) { printError("Standard deviation is 0"); return; } QString testName; int df = 0; switch (test) { case TTest: { testName = "T"; m_statisticValue.append((mean - m_populationMean) / (std / sqrt(n))); df = n - 1; printLine(6, i18n("Degree of Freedom is %1", df), "blue"); break; } case ZTest: { testName = "Z"; df = 0; m_statisticValue.append((mean - m_populationMean) / (std / sqrt(n))); break; } } m_pValue.append(getPValue(test, m_statisticValue[0], m_columns[0]->name(), i18n("%1",m_populationMean), df)); m_currTestName = "

" + i18n("One Sample %1 Test for %2", testName, m_columns[0]->name()) + "

"; printLine(2, i18n("Significance level is %1", round(m_significanceLevel)), "blue"); printLine(4, i18n("%1 Value is %2", testName, round(m_statisticValue[0])), "green"); printLine(5, i18n("P Value is %1", m_pValue[0]), "green"); printTooltip(5, getPValueTooltip(m_pValue[0])); return; } /*************************************One Way Anova***************************************/ // all standard variables and formulas are taken from this wikipedia page: // https://en.wikipedia.org/wiki/One-way_analysis_of_variance // b stands for b/w groups // w stands for within groups // np is number of partition i.e., number of classes void HypothesisTest::performOneWayAnova() { int np, totalRows; countPartitions(m_columns[0], np, totalRows); int* ni = new int[np]; double* sum = new double[np]; double* mean = new double[np]; double* std = new double[np]; QString* colNames = new QString[np]; QMap classnameToIndex; QString baseColName; if (m_columns[0]->isNumeric()) baseColName = m_columns[0]->name(); findStatsCategorical(m_columns[0], m_columns[1], ni, sum, mean, std, classnameToIndex, np, totalRows); double yBar = 0; // overall mean double sB = 0; // sum of squares of (mean - overall_mean) between the groups int fB = 0; // degree of freedom between the groups double msB = 0; // mean sum of squares between the groups double sW = 0; // sum of squares of (value - mean of group) within the groups int fW = 0; // degree of freedom within the group double msW = 0; // mean sum of squares within the groups // now finding mean of each group; for (int i = 0; i < np; i++) yBar += mean[i]; yBar /= np; for (int i = 0; i < np; i++) { sB += ni[i] * gsl_pow_2(mean[i] - yBar); if (ni[i] > 1) sW += gsl_pow_2(std[i])*(ni[i] - 1); else sW += gsl_pow_2(std[i]); fW += ni[i] - 1; } fB = np - 1; msB = sB / fB; msW = sW / fW; m_statisticValue.append(msB / msW); m_pValue.append(nsl_stats_fdist_p(m_statisticValue[0], static_cast(np-1), fW)); QMapIterator i(classnameToIndex); while (i.hasNext()) { i.next(); colNames[i.value()-1] = baseColName + " " + i.key(); } // now printing the statistics and result; int rowCount = np + 1, columnCount = 5; QVariant* rowMajor = new QVariant[rowCount*columnCount]; // header data; rowMajor[0] = ""; rowMajor[1] = "Ni"; rowMajor[2] = "Sum"; rowMajor[3] = "Mean"; rowMajor[4] = "Std"; // table data for (int row_i = 1; row_i < rowCount ; row_i++) { rowMajor[row_i*columnCount] = colNames[row_i - 1]; rowMajor[row_i*columnCount + 1] = ni[row_i - 1]; rowMajor[row_i*columnCount + 2] = sum[row_i - 1]; rowMajor[row_i*columnCount + 3] = mean[row_i - 1]; rowMajor[row_i*columnCount + 4] = std[row_i - 1]; } m_statsTable = "

" + i18n("Group Summary Statistics") + "

"; m_statsTable += getHtmlTable(rowCount, columnCount, rowMajor); m_statsTable += getLine(""); m_statsTable += getLine(""); m_statsTable += "

" + i18n("Grand Summary Statistics") + "

"; m_statsTable += getLine(""); m_statsTable += getLine(i18n("Overall Mean is %1", round(yBar))); rowCount = 4; columnCount = 3; rowMajor->clear(); rowMajor[0] = ""; rowMajor[1] = "Between Groups"; rowMajor[2] = "Within Groups"; int baseIndex = 0; baseIndex = 1 * columnCount; rowMajor[baseIndex + 0] = "Sum of Squares"; rowMajor[baseIndex + 1] = sB; rowMajor[baseIndex + 2] = sW; baseIndex = 2 * columnCount; rowMajor[baseIndex + 0] = "Degree of Freedom"; rowMajor[baseIndex + 1] = fB; rowMajor[baseIndex + 2] = fW; baseIndex = 3 * columnCount; rowMajor[baseIndex + 0] = "Mean Square Value"; rowMajor[baseIndex + 1] = msB; rowMajor[baseIndex + 2] = msW; m_statsTable += getHtmlTable(rowCount, columnCount, rowMajor); delete[] ni; delete[] sum; delete[] mean; delete[] std; delete[] colNames; printLine(1, i18n("F Value is %1", round(m_statisticValue[0])), "green"); printLine(2, i18n("P Value is %1 ", m_pValue[0]), "green"); printTooltip(2, getPValueTooltip(m_pValue[0])); return; } /*************************************Two Way Anova***************************************/ // all formulas and symbols are taken from: http://statweb.stanford.edu/~susan/courses/s141/exanova.pdf //TODO: suppress warning of variable length array are a C99 feature. //TODO: add assumptions verification option //TODO: add tail option (if needed) void HypothesisTest::performTwoWayAnova() { int np_a, totalRows_a; int np_b, totalRows_b; countPartitions(m_columns[0], np_a, totalRows_a); countPartitions(m_columns[1], np_b, totalRows_b); QVector> groupMean(np_a, QVector(np_b)); QVector> replicates(np_a, QVector(np_b)); for (int i = 0; i < np_a; i++) for (int j = 0; j < np_b; j++) { groupMean[i][j] = 0; replicates[i][j] = 0; } if (totalRows_a != totalRows_b) { printError("There is missing data in at least one of the rows"); return; } QMap catToNumber_a; QMap catToNumber_b; int partitionNumber_a = 1; int partitionNumber_b = 1; for (int i = 0; i < totalRows_a; i++) { QString name_a = m_columns[0]->textAt(i); QString name_b = m_columns[1]->textAt(i); double value = m_columns[2]->valueAt(i); if (catToNumber_a[name_a] == 0) { catToNumber_a[name_a] = partitionNumber_a; partitionNumber_a++; } if (catToNumber_b[name_b] == 0) { catToNumber_b[name_b] = partitionNumber_b; partitionNumber_b++; } groupMean[catToNumber_a[name_a] - 1][catToNumber_b[name_b] - 1] += value; replicates[catToNumber_a[name_a] - 1][catToNumber_b[name_b] - 1] += 1; } int replicate = replicates[0][0]; for (int i = 0; i < np_a; i++) for (int j = 0; j < np_b; j++) { if (replicates[i][j] == 0) { printError("Dataset should have at least one data value corresponding to each feature combination"); return; } if (replicates[i][j] != replicate) { printError("Number of experiments perfomed for each combination of levels
" "between Independet Var.1 and Independent Var.2 must be equal"); return; } groupMean[i][j] /= replicates[i][j]; } double ss_within = 0; for (int i = 0; i < totalRows_a; i++) { QString name_a = m_columns[0]->textAt(i); QString name_b = m_columns[1]->textAt(i); double value = m_columns[2]->valueAt(i); ss_within += gsl_pow_2(value - groupMean[catToNumber_a[name_a] - 1][catToNumber_b[name_b] - 1]); } int df_within = (replicate - 1) * np_a * np_b; double ms_within = ss_within / df_within; double* mean_a = new double[np_a]; double* mean_b = new double[np_b]; for (int i = 0; i < np_a; i++) { for (int j = 0; j < np_b; j++) { mean_a[i] += groupMean[i][j] / np_b; mean_b[j] += groupMean[i][j] / np_a; } } double mean = 0; for (int i = 0; i < np_a; i++) mean += mean_a[i] / np_a; double ss_a = 0; for (int i = 0; i < np_a; i++) ss_a += gsl_pow_2(mean_a[i] - mean); ss_a *= replicate * np_b; int df_a = np_a - 1; double ms_a = ss_a / df_a; double ss_b = 0; for (int i = 0; i < np_b; i++) ss_b += gsl_pow_2(mean_b[i] - mean); ss_b *= replicate * np_a; int df_b = np_b - 1; double ms_b = ss_b / df_b; double ss_interaction = 0; for (int i = 0; i < np_a; i++) for (int j = 0; j < np_b; j++) ss_interaction += gsl_pow_2(groupMean[i][j] - mean_a[i] - mean_b[j] + mean); ss_interaction *= replicate; int df_interaction = (np_a - 1) * (np_b - 1); double ms_interaction = ss_interaction / df_interaction; QString* partitionNames_a = new QString[np_a]; QString* partitionNames_b = new QString[np_b]; QMapIterator itr_a(catToNumber_a); while (itr_a.hasNext()) { itr_a.next(); partitionNames_a[itr_a.value()-1] = itr_a.key(); } QMapIterator itr_b(catToNumber_b); while (itr_b.hasNext()) { itr_b.next(); partitionNames_b[itr_b.value()-1] = itr_b.key(); } // printing table; // HtmlCell constructor structure; data, level, rowSpanCount, m_columnspanCount, isHeader; QList rowMajor; rowMajor.append(new HtmlCell("", 0, true, "", 2, 1)); for (int i = 0; i < np_b; i++) rowMajor.append(new HtmlCell(partitionNames_b[i], 0, true, "", 1, 2)); rowMajor.append(new HtmlCell("Mean", 0, true, "", 2)); for (int i = 0; i < np_b; i++) { rowMajor.append(new HtmlCell("Mean", 1, true)); rowMajor.append(new HtmlCell("Replicate", 1, true)); } int level = 2; for (int i = 0; i < np_a; i++) { rowMajor.append(new HtmlCell(partitionNames_a[i], level, true)); for (int j = 0; j < np_b; j++) { rowMajor.append(new HtmlCell(round(groupMean[i][j]), level)); rowMajor.append(new HtmlCell(replicates[i][j], level)); } rowMajor.append(new HtmlCell(round(mean_a[i]), level)); level++; } rowMajor.append(new HtmlCell("Mean", level, true)); for (int i = 0; i < np_b; i++) rowMajor.append(new HtmlCell(round(mean_b[i]), level, false, "", 1, 2)); rowMajor.append(new HtmlCell(round(mean), level)); m_statsTable = "

" + i18n("Contingency Table") + "

"; m_statsTable += getHtmlTable3(rowMajor); m_statsTable += "
"; m_statsTable += "

" + i18n("results table") + "

"; rowMajor.clear(); level = 0; rowMajor.append(new HtmlCell("", level, true)); rowMajor.append(new HtmlCell("SS", level, true)); rowMajor.append(new HtmlCell("DF", level, true, "degree of freedom")); rowMajor.append(new HtmlCell("MS", level, true)); level++; rowMajor.append(new HtmlCell(m_columns[0]->name(), level, true)); rowMajor.append(new HtmlCell(round(ss_a), level)); rowMajor.append(new HtmlCell(df_a, level)); rowMajor.append(new HtmlCell(round(ms_a), level)); level++; rowMajor.append(new HtmlCell(m_columns[1]->name(), level, true)); rowMajor.append(new HtmlCell(round(ss_b), level)); rowMajor.append(new HtmlCell(df_b, level)); rowMajor.append(new HtmlCell(round(ms_b), level)); level++; rowMajor.append(new HtmlCell("Interaction", level, true)); rowMajor.append(new HtmlCell(round(ss_interaction), level)); rowMajor.append(new HtmlCell(df_interaction, level)); rowMajor.append(new HtmlCell(round(ms_interaction), level)); level++; rowMajor.append(new HtmlCell("Within", level, true)); rowMajor.append(new HtmlCell(round(ss_within), level)); rowMajor.append(new HtmlCell(df_within, level)); rowMajor.append(new HtmlCell(round(ms_within), level)); m_statsTable += getHtmlTable3(rowMajor); double fValue_a = ms_a / ms_within; double fValue_b = ms_b / ms_within; double fValue_interaction = ms_interaction / ms_within; double m_pValue_a = nsl_stats_fdist_p(fValue_a, static_cast(np_a - 1), df_a); double m_pValue_b = nsl_stats_fdist_p(fValue_b, static_cast(np_b - 1), df_b); printLine(0, "F(df" + m_columns[0]->name() + ", dfwithin) is " + round(fValue_a), "blue"); printLine(1, "F(df" + m_columns[1]->name() + ", dfwithin) is " + round(fValue_b), "blue"); printLine(2, "F(dfinteraction, dfwithin) is " + round(fValue_interaction), "blue"); printLine(4, "P(df" + m_columns[0]->name() + ", dfwithin) is " + round(m_pValue_a), "blue"); printLine(5, "P(df" + m_columns[1]->name() + ", dfwithin) is " + round(m_pValue_b), "blue"); // printLine(2, "P(dfinteraction, dfwithin) is " + round(fValue_interaction), "blue"); m_statisticValue.append(fValue_a); m_statisticValue.append(fValue_b); m_statisticValue.append(fValue_interaction); m_pValue.append(m_pValue_a); m_pValue.append(m_pValue_b); delete[] mean_a; delete[] mean_b; delete[] partitionNames_a; delete[] partitionNames_b; return; } /**************************************Levene Test****************************************/ // Some reference to local variables. // np = number of partitions // df = degree of fredom // totalRows = total number of rows in column // these variables are taken from: https://en.wikipedia.org/wiki/Levene%27s_test // yiBar = mean of ith group; // Zij = |Yij - yiBar| // ziBar = mean of Zij for group i // ziBarBar = mean for all zij // ni = number of elements in group i void HypothesisTest::m_performLeveneTest(bool categoricalVariable) { if (m_columns.size() != 2) { printError("Inappropriate number of m_columns selected"); return; } int np = 0; int n = 0; if (!categoricalVariable && m_columns[0]->isNumeric()) np = m_columns.size(); else countPartitions(m_columns[0], np, n); if (np < 2) { printError("Select at least two m_columns / classes"); return; } double* yiBar = new double[np]; double* ziBar = new double[np]; double ziBarBar = 0; double* ni = new double[np]; for (int i = 0; i < np; i++) { yiBar[i] = 0; ziBar[i] = 0; ni[i] = 0; } double fValue; int df = 0; int totalRows = 0; QString* colNames = new QString[np]; if (!categoricalVariable && m_columns[0]->isNumeric()) { totalRows = m_columns[0]->rowCount(); double value = 0; for (int j = 0; j < totalRows; j++) { int numberNaNCols = 0; for (int i = 0; i < np; i++) { value = m_columns[i]->valueAt(j); if (std::isnan(value)) { numberNaNCols++; continue; } yiBar[i] += value; ni[i]++; n++; } if (numberNaNCols == np) { totalRows = j; break; } } for (int i = 0; i < np; i++) { if (ni[i] > 0) yiBar[i] /= ni[i]; else { printError("One of the selected m_columns is empty
" "or have choosen Independent Var.1 wrongly"); return; } } for (int j = 0; j < totalRows; j++) { for (int i = 0; i < np; i++) { value = m_columns[i]->valueAt(j); if (!(std::isnan(value))) ziBar[i] += fabs(value - yiBar[i]); } } for (int i = 0; i < np; i++) { ziBarBar += ziBar[i]; if (ni[i] > 0) ziBar[i] /= ni[i]; } ziBarBar /= n; double numberatorValue = 0; double denominatorValue = 0; for (int j = 0; j < totalRows; j++) { for (int i = 0; i < np; i++) { value = m_columns[i]->valueAt(j); if (!(std::isnan(value))) { double zij = fabs(value - yiBar[i]); denominatorValue += gsl_pow_2( (zij - ziBar[i])); } } } if (denominatorValue == 0.0) { printError( i18n("Denominator value is %1", denominatorValue)); return; } for (int i = 0; i < np; i++) { colNames[i] = m_columns[i]->name(); numberatorValue += ni[i]*gsl_pow_2( (ziBar[i]-ziBarBar)); } fValue = ((n - np) / (np - 1)) * (numberatorValue / denominatorValue); } else { QMap classnameToIndex; AbstractColumn::ColumnMode originalColMode = m_columns[0]->columnMode(); m_columns[0]->setColumnMode(AbstractColumn::Text); int partitionNumber = 1; QString name; double value; int classIndex; for (int j = 0; j < n; j++) { name = m_columns[0]->textAt(j); value = m_columns[1]->valueAt(j); if (std::isnan(value)) { n = j; break; } if (classnameToIndex[name] == 0) { classnameToIndex[name] = partitionNumber; partitionNumber++; } classIndex = classnameToIndex[name]-1; ni[classIndex]++; yiBar[classIndex] += value; } for (int i = 0; i < np; i++) { if (ni[i] > 0) yiBar[i] /= ni[i]; else { printError("One of the selected m_columns is empty
" "or have choosen Independent Var.1 wrongly"); m_columns[0]->setColumnMode(originalColMode); return; } } for (int j = 0; j < n; j++) { name = m_columns[0]->textAt(j); value = m_columns[1]->valueAt(j); classIndex = classnameToIndex[name] - 1; ziBar[classIndex] += fabs(value - yiBar[classIndex]); } for (int i = 0; i < np; i++) { ziBarBar += ziBar[i]; ziBar[i] /= ni[i]; } ziBarBar /= n; double numberatorValue = 0; double denominatorValue = 0; for (int j = 0; j < n; j++) { name = m_columns[0]->textAt(j); value = m_columns[1]->valueAt(j); classIndex = classnameToIndex[name] - 1; double zij = fabs(value - yiBar[classIndex]); denominatorValue += gsl_pow_2( (zij - ziBar[classIndex])); } for (int i = 0; i < np; i++) numberatorValue += ni[i]*gsl_pow_2( (ziBar[i]-ziBarBar)); if (denominatorValue == 0.0) { printError( "number of data points is less or than equal to number of categorical variables"); m_columns[0]->setColumnMode(originalColMode); return; } fValue = ((n - np) / (np - 1)) * (numberatorValue / denominatorValue); QMapIterator i(classnameToIndex); while (i.hasNext()) { i.next(); colNames[i.value()-1] = m_columns[0]->name() + " " + i.key(); } m_columns[0]->setColumnMode(originalColMode); } df = n - np; // now making the stats table. int rowCount = np+1; int columnCount = 4; QVariant* rowMajor = new QVariant[rowCount*columnCount]; // header data; rowMajor[0] = ""; rowMajor[1] = "Ni"; rowMajor[2] = "yiBar"; rowMajor[3] = "ziBar"; // table data for (int row_i = 1; row_i < rowCount; row_i++) { rowMajor[row_i*columnCount] = colNames[row_i-1]; rowMajor[row_i*columnCount + 1] = ni[row_i-1]; rowMajor[row_i*columnCount + 2] = yiBar[row_i-1]; rowMajor[row_i*columnCount + 3] = ziBar[row_i-1]; } m_statsTable = getHtmlTable(rowCount, columnCount, rowMajor); delete[] rowMajor; delete[] yiBar; delete[] ziBar; delete[] ni; m_pValue.append(nsl_stats_fdist_p(fValue, static_cast(np-1), df)); printLine(0, "Null Hypothesis: Variance is equal between all classes", "blue"); printLine(1, "Alternate Hypothesis: Variance is not equal in at-least one pair of classes", "blue"); printLine(2, i18n("Significance level is %1", round(m_significanceLevel)), "blue"); printLine(4, i18n("F Value is %1 ", round(fValue)), "green"); printLine(5, i18n("P Value is %1 ", m_pValue[0]), "green"); printLine(6, i18n("Degree of Freedom is %1", df), "green"); printTooltip(5, getPValueTooltip(m_pValue[0])); if (m_pValue[0] <= m_significanceLevel) printLine(8, "Requirement for homogeneity is not met", "red"); else printLine(8, "Requirement for homogeneity is met", "green"); m_statisticValue.append(fValue); return; } //TODO change ("⋖") symbol to ("<"), currently macro UTF8_QSTRING is not working properly if used "<" symbol; // TODO: check for correctness between: for TestZ with TailTwo // m_pValue.append(2*gsl_cdf_tdist_P(value, df) v/s // m_pValue.append(gsl_cdf_tdis_P(value, df) + gsl_cdf_tdis_P(-value, df); double HypothesisTest::getPValue(const int& test, double& value, const QString& col1Name, const QString& col2Name, const int df) { double pValue = 0; QString nullHypothesisSign; QString alternateHypothesisSign; switch (testType(test)) { case TTest: { switch (m_tail) { case Negative: pValue = gsl_cdf_tdist_P(value, df); nullHypothesisSign = UTF8_QSTRING("≥"); alternateHypothesisSign = UTF8_QSTRING("⋖"); break; case Positive: value *= -1; pValue = gsl_cdf_tdist_P(value, df); nullHypothesisSign = UTF8_QSTRING("≤"); alternateHypothesisSign = UTF8_QSTRING(">"); break; case Two: pValue = 2.*gsl_cdf_tdist_P(-fabs(value), df); nullHypothesisSign = UTF8_QSTRING("="); alternateHypothesisSign = UTF8_QSTRING("≠"); break; } break; } case ZTest: { switch (m_tail) { case Negative: pValue = gsl_cdf_ugaussian_P(value); nullHypothesisSign = UTF8_QSTRING("≥"); alternateHypothesisSign = UTF8_QSTRING("⋖"); break; case Positive: value *= -1; pValue = gsl_cdf_ugaussian_P(value); nullHypothesisSign = UTF8_QSTRING("≤"); alternateHypothesisSign = UTF8_QSTRING(">"); break; case Two: pValue = 2*gsl_cdf_ugaussian_P(-fabs(value)); nullHypothesisSign = UTF8_QSTRING("="); alternateHypothesisSign = UTF8_QSTRING("≠"); break; } break; } } printLine(0, i18n("Null Hypothesis: Population mean of %1 %2 Population mean of %3 ", col1Name, nullHypothesisSign, col2Name), "blue"); printLine(1, i18n("Alternate Hypothesis: Population mean of %1 %2 Population mean of %3 ", col1Name, alternateHypothesisSign, col2Name), "blue"); if (pValue > 1) return 1; return pValue; } QString HypothesisTest::getPValueTooltip(const double &pValue) { if (pValue <= m_significanceLevel) return i18n("We can safely reject Null Hypothesis for significance level %1", round(m_significanceLevel)); return i18n("There is a plausibility for Null Hypothesis to be true"); } // Virtual functions QWidget* HypothesisTest::view() const { if (!m_partView) { m_view = new HypothesisTestView(const_cast(this)); m_partView = m_view; } return m_partView; }