2009-10-02 19 views
8

Mam ramkę danych i chcę utworzyć tabelę statystyk podsumowujących, zawierającą liczbę prawidłowych wartości liczbowych, średnią i odchylenie standardowe (sd) według grup dla każdej z trzech kolumn. Nie mogę znaleźć żadnej funkcji do policzenia liczby wartości numerycznych w R. Mogę użyć length(), która mówi mi, ile wartości istnieje, i mogę użyć colSums(is.na(x)), aby policzyć liczbę wartości NA, ale colSums(is.numeric(x)) nie działa w ten sam sposób. Jak policzyć liczbę wartości numerycznych w kolumnie?

Mogę użyć tapply z {length - liczba wartości NA - liczba pustych wartości - liczba wartości tekstowych}, ale na pewno jest łatwiejszy sposób.

Moje dane (chcę pogrupować według kolumny Nominal i obliczyć statystyki podsumowujące dla kolumn Actual, LinPred i QuadPred):

structure(list(Nominal = c(1, 3, 6, 10, 30, 50, 150, 250, 1, 
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1, 
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1, 
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1, 
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250), Actual = c(NA, 
0.422, 0.782, 1.25, 3.85, 6.94, 18.8, 31.2, 0.118, 0.361, 0.747, 
1.18, 3.58, 5.82, 16.7, 29, 0.113, 0.382, 0.692, 1.12, 3.51, 
5.43, 17.1, 28.7, 0.134, 0.402, 0.718, 1.25, 3.65, 6.52, NA, 
28.8, 0.123, 0.396, 0.664, 1.12, 3.83, 5.6, NA, 28.1, 0.112, 
0.341, 0.7, 1.08, 3.25, 5.97, NA, 27.1, 0.106, 0.35, 0.674, 1.14, 
3.28, 5.5, 17.3, 30, 0.122, 0.321, 0.673, 1.22, 3.41, 5.85, 17.6, 
28.1, 0.129, 0.351, 0.737, 1.06, 3.39, 5.53, 15.9, 28.5), LinPred = c(NA, 
3.49519490135683, 6.4706724568458, 10.3387932789814, 31.8283534019573, 
57.3678690865708, 155.393324109068, 257.881995464799, 0.982569410055046, 
2.99101676001009, 6.18138991672881, 9.76022819874748, 29.5967452353405, 
48.1108278028274, 138.036371702049, 239.698521514589, 0.941243332895477, 
3.16458628408028, 5.72680306797355, 9.26431527283265, 29.0181801551066, 
44.887393784381, 141.342457874815, 237.218956885015, 1.07941778099747, 
3.36900393602722, 6.0686652233011, 10.6136646056736, 31.1174212178803, 
55.6364968333108, NA, 245.979704049963, 0.98544222985819, 3.3177445444967, 
5.60733069952645, 9.50304445584572, 32.6552029637958, 47.7767234652982, 
NA, 239.999441704736, 0.89146667871891, 2.8478667888003, 5.91488704870955, 
9.1613151789756, 27.7001284491792, 50.9377192763467, NA, 231.456209782983, 
0.887738051402174, 3.04188235451485, 5.9023034783202, 10.0163659588551, 
28.9092709123842, 48.5084526866061, 152.684283738776, 264.805729023739, 
1.02899341554071, 2.78585700701375, 5.89347501806154, 10.7226427795477, 
30.0569707460098, 51.5984137771366, 155.332821816374, 248.031654532288, 
1.09079263735132, 3.05071081477351, 6.45849647461568, 9.31008913816238, 
29.8804015408367, 48.7733064943658, 140.324439376654, 251.563038635751 
), QuadPred = c(NA, 3.46077095737974, 6.38659713413108, 10.1956079501556, 
31.4700369979564, 57.0089799611706, 157.775316006369, 268.303966059862, 
0.99289436409299, 2.96536517477853, 6.10198249392715, 9.62549220297933, 
29.2517496204359, 47.7196128593832, 139.600469198163, 248.272682787657, 
0.95232583127381, 3.13590297331348, 5.65480031033985, 9.13693141349813, 
28.6769820181676, 44.4936547741659, 143.050878627236, 245.555818447238, 
1.08417831830729, 3.33895371044810, 6.00044125019758, 10.4882228621509, 
30.8451526869812, 55.4331759085967, NA, 256.446833964951, 0.991679220421247, 
3.28844923081897, 5.54540949253351, 9.3907657095483, 32.3793538902883, 
47.5218142460371, NA, 249.828516445647, 0.899183876120787, 2.82554368740693, 
5.84875388286628, 9.05319326862309, 27.4395572248486, 50.7001828907023, 
NA, 240.411024762687, 0.884412915928806, 3.05257006009469, 5.93046554432476, 
10.0673979669, 29.0311859234644, 48.645035648271, 151.914544909710, 
261.273991566153, 1.02660962824666, 2.79491765184684, 5.92158513760114, 
10.7773327827008, 30.1813919027873, 51.7318741314584, 154.518856412401, 
245.027488125567, 1.08881969774848, 3.06145444119556, 6.48990638077339, 
9.35738460692028, 30.0044505131336, 48.9096796323938, 139.747394069421, 
248.451100154569)), .Names = c("Nominal", "Actual", "LinPred", 
"QuadPred"), row.names = c(NA, -72L), class = "data.frame") 

Odpowiedz

9

Oto kilka dodatkowych pakietów, które mogłyby pomóc (patrz Quick-R):

Korzystając z pakietu Hmisc:

library(Hmisc) 

describe(mydata) 
# n, nmiss, unique, mean, 5,10,25,50,75,90,95th percentiles 
# 5 lowest and 5 highest scores 

Korzystając z pakietu pastecs:

library(pastecs) 

stat.desc(mydata) 
# nbr.val, nbr.null, nbr.na, min max, range, sum, 
# median, mean, SE.mean, CI.mean, var, std.dev, coef.var 

Korzystając z pakietu psych:

library(psych) 
describe(mydata) 
# item name ,item number, nvalid, mean, sd, 
# median, mad, min, max, skew, kurtosis, se 

Użyłbym funkcji describe.by z pakietu psych:

> describe.by(biastable, as.factor(Nominal)) 
group: 1 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 NaN  NaN 0.00 
Actual  2 8 0.12 0.01 0.12 0.12 0.01 0.11 0.13 0.03 0.09 -1.47 0.00 
LinPred 3 8 0.99 0.08 0.98 0.99 0.10 0.89 1.09 0.20 0.04 -1.70 0.03 
QuadPred 4 8 0.99 0.08 0.99 0.99 0.10 0.88 1.09 0.20 -0.04 -1.64 0.03 
------------------------------------------------------------------------ 
group: 3 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 3.00 0.00 3.00 3.00 0.00 3.00 3.00 0.00 NaN  NaN 0.00 
Actual  2 9 0.37 0.03 0.36 0.37 0.03 0.32 0.42 0.10 0.15 -1.50 0.01 
LinPred 3 9 3.12 0.24 3.05 3.12 0.30 2.79 3.50 0.71 0.15 -1.52 0.08 
QuadPred 4 9 3.10 0.23 3.06 3.10 0.34 2.79 3.46 0.67 0.12 -1.51 0.08 
------------------------------------------------------------------------ 
group: 6 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 6.00 0.00 6.00 6.00 0.00 6.00 6.00 0.00 NaN  NaN 0.00 
Actual  2 9 0.71 0.04 0.70 0.71 0.04 0.66 0.78 0.12 0.46 -1.30 0.01 
LinPred 3 9 6.02 0.30 5.91 6.02 0.28 5.61 6.47 0.86 0.28 -1.43 0.10 
QuadPred 4 9 5.99 0.31 5.93 5.99 0.25 5.55 6.49 0.94 0.26 -1.26 0.10 
------------------------------------------------------------------------ 
group: 10 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 10.00 0.00 10.00 10.00 0.00 10.00 10.00 0.00 NaN  NaN 0.00 
Actual  2 9 1.16 0.07 1.14 1.16 0.09 1.06 1.25 0.19 0.09 -1.71 0.02 
LinPred 3 9 9.85 0.60 9.76 9.85 0.74 9.16 10.72 1.56 0.24 -1.76 0.20 
QuadPred 4 9 9.79 0.62 9.63 9.79 0.72 9.05 10.78 1.72 0.27 -1.65 0.21 
------------------------------------------------------------------------ 
group: 30 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 30.00 0.00 30.00 30.00 0.00 30.00 30.00 0.00 NaN  NaN 0.00 
Actual  2 9 3.53 0.22 3.51 3.53 0.21 3.25 3.85 0.60 0.23 -1.58 0.07 
LinPred 3 9 30.08 1.55 29.88 30.08 1.44 27.70 32.66 4.96 0.21 -1.27 0.52 
QuadPred 4 9 29.92 1.51 30.00 29.92 1.44 27.44 32.38 4.94 0.04 -1.22 0.50 
------------------------------------------------------------------------ 
group: 50 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 50.00 0.00 50.00 50.00 0.00 50.00 50.00 0.00 NaN  NaN 0.00 
Actual  2 9 5.91 0.51 5.82 5.91 0.43 5.43 6.94 1.51 0.90 -0.73 0.17 
LinPred 3 9 50.40 3.98 48.77 50.40 3.21 44.89 57.37 12.48 0.49 -1.16 1.33 
QuadPred 4 9 50.24 3.97 48.91 50.24 2.65 44.49 57.01 12.52 0.39 -1.21 1.32 
------------------------------------------------------------------------ 
group: 150 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 150.00 0.00 150.00 150.00 0.00 150.00 150.00 0.00 NaN  NaN 0.00 
Actual  2 6 17.23 0.97 17.20 17.23 0.67 15.90 18.80 2.90 0.25 -1.23 0.39 
LinPred 3 6 147.19 8.11 147.01 147.19 11.13 138.04 155.39 17.36 -0.01 -2.22 3.31 
QuadPred 4 6 147.77 7.95 147.48 147.77 10.95 139.60 157.78 18.17 0.07 -2.10 3.25 
------------------------------------------------------------------------ 
group: 250 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 250.00 0.00 250.00 250.00 0.00 250.00 250.00 0.00 NaN  NaN 0.00 
Actual  2 9 28.83 1.18 28.70 28.83 0.89 27.10 31.20 4.10 0.59 -0.57 0.39 
LinPred 3 9 246.29 10.57 245.98 246.29 9.31 231.46 264.81 33.35 0.33 -1.26 3.52 
QuadPred 4 9 251.51 8.84 248.45 251.51 5.08 240.41 268.30 27.89 0.62 -1.04 2.95 
> 
3

Czy można użyć czegoś takiego?

length(unique(x)) 
0

Czy complete.cases (lub sum(complete.cases)) robi to, czego potrzebujesz?

3

Co to są "puste wartości" i "wartości tekstowe"? Jeśli masz wektor liczbowy, możesz mieć wartości NA (is.na()), Inf (is.infinite()), NaN (is.nan()) i "prawidłowe" wartości numeryczne.

Dla „prawidłowych” wartości liczbowych (w powyższym sensie) można użyć is.finite():

is.finite(c(1,NA,Inf,NaN)) 
# [1] TRUE FALSE FALSE FALSE 
sum(is.finite(c(1,NA,Inf,NaN))) 
# [1] 1 

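Zatem zamiast colSums(is.numeric(x)) można użyć colSums(is.finite(x)) (jeśli x jest ramką danych, należy ją najpierw zamienić na macierz: colSums(is.finite(as.matrix(x)))).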

+0

+1 Niezupełnie to, o co prosił OP, ale dokładnie to, czego szukałem :-) – scraimer

6

colSums(!is.na(x)) powinien działać.

Powiązane problemy