#################################################################### ## General info: ## ## ## ## This script creates all statistics, data sets and figures ## ## as well as data for some of the tables in Kopf (2021) in order ## ## of appearance in the paper. Please download the associated ## ## data set (csv) at https://grammis.ids-mannheim.de/genitivvondb ## ## ## ## Reference: ## ## Kristin Kopf. 2021. Stellung des adnominalen Genitivs. ## ## In Marek Konopka, Angelika Wöllstein & Ekkehard Felder (eds.), ## ## Determination, syntaktische Funktionen von Nominalphrasen und ## ## Attribution (Bausteine einer Korpusgrammatik des Deutschen 2). ## ## Heidelberg: Heidelberg University Press. ## #################################################################### set.seed(20190527) library('fmsb') library('dplyr') library('ggplot2') library('DAMisc') # import data genvon_en <- read.delim(file.choose(new = FALSE), encoding = "utf8", sep="\t", quote="", header=TRUE) # select csv-file on your computer # set decimal separator to comma options(OutDec= ",") ######################################################## ############### sort & combine levels ################## ######################################################## # sort genvon_en$Type2 <- factor(genvon_en$Type2, levels=c("von", "gen")) genvon_en$Source_type <- factor(genvon_en$Source_type, levels=c("newspaper", "forum")) genvon_en$PersonNameAttr <- factor(genvon_en$PersonNameAttr, levels=c("pet", "given", "title+given", "family", "title+family", "given+family", "irrelevant")) genvon_en$SemNameAttr <- factor(genvon_en$SemNameAttr, levels=c("person", "organization", "sports_club", "band", "country", "place", "misc", "product")) genvon_en$CaseNP <- factor(genvon_en$CaseNP, levels=c("nom", "other")) genvon_en$AppositionNP <- factor(genvon_en$AppositionNP, levels=c("n", "y")) # combine genvon_en$SemNameAttr <- recode(genvon_en$SemNameAttr, person = "Person", country = "Toponym", place = "Toponym", organization = "Gruppe", misc = "Diverses", sports_club = "Gruppe", band = "Gruppe", product = "Diverses") genvon_en$AppositionNP <- recode(genvon_en$AppositionNP, "y" = "y", "n" ="n", "not_checked" = "n") genvon_en$FinalAttr <- recode(genvon_en$FinalAttr, "s" = "s", "other" = "other", "irrelevant" = "other") # truly irrelevant cases are excluded later # add genvon_en$SyllablesAttr2 <- recode(genvon_en$SyllablesAttr, `1` = "1", `2` = "2", `3` = "3", `4`= "4", .default = "5+") # cells without value receive "5+" but are dropped later # replace genvon_en$Decade[which(genvon_en$Decade==1990)] <- 0 genvon_en$Decade[which(genvon_en$Decade==2000)] <- 1 genvon_en$Decade[which(genvon_en$Decade==2010)] <- 2 genvon_en$SyllablesAttr <- as.numeric(as.character(genvon_en$SyllablesAttr)) # empty cells (NAs) will be dropped later genvon_en$LengthNameAttr <- as.numeric(as.character(genvon_en$LengthNameAttr)) # empty cells (NAs) will be dropped later #################### ## make data sets ## #################### # create a data set of proper names in the genitive (irregardless of grammatical properties) genvon_names <- droplevels(subset(genvon_en, IncludePos == "y" & SemReplacement != "n" & NameGrammarAttr !="common_noun" & NameGrammarAttr !="phrase_name" & NameAttr !="misc" & Type !="von")) nrow(genvon_names) #2371 # create dataset genvon_en genvon_en <- droplevels(subset(genvon_en, IncludePos == "y" & DefinitenessHead == "d" & DeterminerHead != "other" & DefinitenessAttr == "d" & NameGrammarAttr == "bare_name" & SemNameAttr != "Diverses" & SemNameAttr != "Produkt" & SemReplacement != "n" & TitleAttr != "y_phrase")) nrow(genvon_en) #1832 # create data set prepost prepost <- droplevels(subset(genvon_en, Type2 == "gen" )) nrow(prepost) #850 # create data set prepost-dev for model selection prepost.devel <- droplevels(subset(prepost, PrepostPartition == "devel" )) nrow(prepost.devel) #277 # create data set prepost-fin for final model prepost.test <- droplevels(subset(prepost, PrepostPartition == "test" )) nrow(prepost.test) #573 ######################################################## ## Kopf (2021), fig. 1 (genitives and *von*-phrases) ### ######################################################## # reorder levels for figure genvon_en$Type <- factor(genvon_en$Type, levels=c("post", "pre", "von")) # basic settings for all plots theme_set(theme_light() + theme(axis.title.y = element_text(size = 14), # title y-axis axis.title.x = element_text(size = 14), # title x-axis axis.text.x = element_text(size = 14, color = "black"), # categories and font size on x-axis axis.text.y = element_text(size = 14, color = "black"), # values and font size on y-axis legend.title=element_text(size=14, color = "black"), # legend title legend.text=element_text(size=14, color = "black"))) # legend categories # create fig. 1 genvon_en_graph <- ggplot(as.data.frame((table(genvon_en$Type, genvon_en$Type2))), aes(x = Var2, y=Freq, fill=Var1)) + geom_bar(stat="identity") + geom_text(label="post: 253 (29,76 %)", x= 2.25, y = 720, color = "black", size = 5) + geom_text(label="die Verfehlungen\nClintons", x= 1.86, y = 720, color = "black", size = 5, fontface="italic") + geom_text(label="prä: 597 (70,24 %)", x= 2.2, y = 300, color = "black", size = 5) + geom_text(label="Köhlers Entscheidung", x= 1.9, y = 300, color = "black", size = 5, fontface="italic") + geom_text(label="982", x= 1.2, y = 500, color = "black", size = 5) + geom_text(label="die Verpflichtungen von Stöger", x= 0.9, y = 500, color = "black", size = 5, fontface="italic") + scale_x_discrete(labels = c(expression(paste("analoge ", italic("von"),"-Kontexte")), "Wahlkontexte Genitiv")) + ylab("") + xlab("") + scale_y_continuous(breaks =seq(0,1000,200)) + coord_flip() + theme(legend.position = "none") + #keine Legende (sonst: top, left, right, bottom; right = default, wenn sonst nichts angegeben) theme(aspect.ratio = .35, #hier kann Verhältnis zwischen Abbildung/Balken zu Text verändert werden panel.grid.major.y = element_blank(), #keine (Haupt-)Gitterlinien auf y-Achse panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank()) #keine Hilfsgitterlinien auf x-Achse # change colors show(genvon_en_graph <- genvon_en_graph + scale_fill_manual(values = c("pre" = "#7AD151FF", "post" = "#DCE319FF", "von"= "#1F968BFF" ))) #viridis ################################################## ## statistics development set (model selection) ## ################################################## # change order of levels so pre is placed left on potential figures prepost.devel$Type <- factor(prepost.devel$Type, levels=c("pre", "post", "von")) prepost.predictors <- c( "Source_type", # medium "Decade", # decade "SemNameAttr", # name type of attribute "SyllablesAttr", # no. of syllables of attribute "CaseNP" # case of overall noun phrase ) # for comments on excluded predictors see Kopf (2021) m.prepost.dev <- (glm(paste("Type", " ~ ", paste(prepost.predictors, collapse = " + ")), family = "binomial", data = prepost.devel)) summary(m.prepost.dev) ## model fit & prediction accuracy # model fit NagelkerkeR2(m.prepost.dev) # contingency table predicted vs. observed xtabs(~ifelse(predict(m.prepost.dev, type = "response") > .5, 1,0) + m.prepost.dev$y) # proportion of correctly predicted cases (model) sum(diag(xtabs(~ifelse(predict(m.prepost.dev, type = "response") > .5, 1,0) + m.prepost.dev$y)))/length(m.prepost.dev$y) # 0.787 # proportion of correctly predicted cases (intercept-only model) pred.interceptonly <- round(sum(m.prepost.dev$y)/length(m.prepost.dev$y),0) preds.interceptonly <- rep(pred.interceptonly, length(m.prepost.dev$y)) sum(diag(xtabs(~preds.interceptonly + m.prepost.dev$y)))/length(m.prepost.dev$y) # 0.715 ## proportional reduction in error (PRE) # classification errors (model) (e.model <- length(m.prepost.dev$y) - sum(diag(xtabs(~ifelse(predict(m.prepost.dev, type = "response") > .5, 1,0) + m.prepost.dev$y)))) # 59 # classification errors (intercept-only model) (e.null <- length(m.prepost.dev$y) - max(c(sum(m.prepost.dev$y), length(m.prepost.dev$y) -sum(m.prepost.dev$y)))) # 79 # PRE (calculated by hand) round((e.null-e.model)/e.null, 3) # 0.253 # PRE (using DAMisc::pre) round(pre(m.prepost.dev)$pre, 3) # 0.253 ####################################### ## statistics test set (final model) ## ####################################### # change order of levels so pre is placed left on the figure prepost.test$Type <- factor(prepost.test$Type, levels=c("pre", "post", "von")) # model summary, Kopf (2021), tab. 8 m.final <- (glm(paste("Type", " ~ ", paste(prepost.predictors, collapse = " + ")), family = "binomial", data = prepost.test)) summary(m.final) ## model fit & prediction accuracy # model fit NagelkerkeR2(m.final) # 0.257 # contingency table predicted vs. observed xtabs(~ifelse(predict(m.final, type = "response") > .5, 1,0) + m.final$y) # proportion of correctly predicted cases (model) sum(diag(xtabs(~ifelse(predict(m.final, type = "response") > .5, 1,0) + m.final$y)))/nrow(prepost.test) # 0.773 # proportion of correctly predicted cases (intercept-only model) (nrow(prepost.test) - sum(m.final$y))/nrow(prepost.test) # 0.696 ## proportional reduction in error (PRE): # classification errors (model) (e.model <- nrow(prepost.test) - sum(diag(xtabs(~ifelse(predict(m.final, type = "response") > .5, 1,0) + m.final$y)))) # 130 # classification errors (intercept-only model) (e.null <- nrow(prepost.test) - max(c(sum(m.final$y), nrow(prepost.test)-sum(m.final$y)))) # 174 # PRE (calculated by hand) round((e.null-e.model)/e.null, 3) # 0.253 # PRE (using DAMisc::pre) round(pre(m.final)$pre, 3) ##################################################################### ## Kopf (2021), fig. 2 (coefficients and 95% confidence intervals) ## ##################################################################### # coefficient estimates coeffs <- coef(m.final) # 95% confidence intervals ci.wald <- confint(m.final, method="Wald") # create new data frame for ggplot2 coeffs2 <- as.data.frame(coeffs) coeffs2$var <- rownames(coeffs2) coeffs2 <- cbind.data.frame(coeffs2,ci.wald) colnames(coeffs2) <- c("coeffs", "var", "lower", "upper") coeffs2$mycolour <- ifelse(data.table::between(0, coeffs2$lower, coeffs2$upper), "gray", "green3") # rename labels coeffs2$var2 <- c("Konstante (Intercept)", "Beleg ist aus Forum (vs. Zeitung), 3.2", "Jahrzehnt, 3.2", "Genitiv ist Kollektiv (vs. Person), 3.1.3", "Genitiv ist Toponym (vs. Person), 3.1.3", "Silbenzahl Genitiv, 3.1.2", "NP ist Nicht-Nominativ (vs. Nominativ), 3.1.1") # cut intercept coeffs2 <- coeffs2[2:nrow(coeffs2), ] # create dotplot (fig. 2) ggplot(coeffs2) + # dots geom_point(aes(x = coeffs, y = reorder(var2, coeffs), color = mycolour), size=3) + scale_color_identity() + labs(title="", subtitle="", y="", x="", caption="") + # confidence intervals geom_segment(aes(x=lower, xend=upper, y=var2, yend=var2, col= mycolour), size=1) + # vertical line at x = 0 geom_segment(aes(x=0, xend=0, y=0, yend=nrow(coeffs2)+1), size = 0.3, col = "lightgray") ######################################################## ## Kopf (2021), fig. 3 (no. of syllables in test set) ## ######################################################## # re-sort levels for figure prepost.test$Type <- factor(prepost.test$Type, levels=c("post", "pre", "von")) # summarize data by no. of syllables of attribute and position (pre- or postnominal) data_summary <- prepost.test %>% group_by(SyllablesAttr2) %>% mutate(n_obs = n()) %>% ungroup() %>% group_by(SyllablesAttr2, Type, n_obs) %>% summarise(freq = n()) %>% mutate(prop = freq/n_obs) %>% ungroup() %>% mutate(prop.dec = format(prop, decimal.mark = ",")) # create fig. 3 silben <- ggplot(data_summary, aes(x = SyllablesAttr2, y = prop, fill = Type)) + geom_bar(stat = "identity") + labs(fill = "Genitivstellung") + geom_text(aes(label=freq, y = prop), position = position_stack(vjust=0.5), color = "black",size=5) + ylab("") + xlab("Silbenzahl") + scale_y_continuous(breaks =seq(0,1,0.2), labels =scales::percent_format(accuracy = 1,decimal.mark = ',')) + theme(aspect.ratio = 1.1) # change colors and legend labels show(silben <- silben + scale_fill_manual(values = c("post" = "#DCE319FF", "pre" = "#7AD151FF"), labels=c("postnominal","pränominal"))) # create contingency table for no. of syllables by position (absolute numbers) table(prepost$Type, prepost$SyllablesAttr2) # calculate median and mean of no. of syllables median(prepost.test$SyllablesAttr) mean(prepost.test$SyllablesAttr) ############################################################## ## Kopf (2021), fig. 4 (name type of attribute in test set) ## ############################################################## # create contingency table for name type and genitive position (percentages) show(nametype <- prop.table(table(prepost.test$Type, prepost.test$SemNameAttr),2)) # summarize data by name type of attribute and position (pre- or postnominal) data_summary <- prepost.test %>% group_by(SemNameAttr) %>% mutate(n_obs = n()) %>% ungroup() %>% group_by(SemNameAttr, Type, n_obs) %>% summarise(freq = n()) %>% mutate(prop = freq/n_obs) %>% ungroup() %>% mutate(prop.dec = format(prop, decimal.mark = ","))#%>% # je nach Aufbau der Ausgangstabelle: hier Stop # create fig. 4 nametype_data <- ggplot(data_summary, aes(x = SemNameAttr, y = prop, fill = Type)) + geom_bar(stat = "identity") + labs(fill = "Genitivstellung") + geom_text(aes(label=freq, y = prop), position = position_stack(vjust=0.5), color = "black",size=5) + ylab("") + xlab("") + coord_flip() + scale_y_continuous(breaks =seq(0,1,0.2), labels =scales::percent_format(accuracy = 1,decimal.mark = ',')) + scale_x_discrete(labels= c("Personennamen","Namen mensch-\n licher Kollektiva","Toponyme")) + theme(aspect.ratio = 0.6) # change colors and legend labels show(nametype_data <- nametype_data + scale_fill_manual(values = c("post" = "#DCE319FF", "pre" = "#7AD151FF"), labels=c("postnominal","pränominal"))) ################################################### ## Kopf (2021), tab. 6 (name type for all names) ## ################################################### table(genvon_names$NameGrammarAttr, genvon_names$NameAttr) prop.table(table(genvon_names$NameGrammarAttr, genvon_names$NameAttr),2) ######################################################## ## Kopf (2021), fig. 5 (name intimacy in genvon-en) #### ######################################################## # change order of levels so pre comes first in the figure genvon_en$Type <- factor(genvon_en$Type, levels=c("von", "post", "pre")) # create data set kose_data <- droplevels(subset(genvon_en, SyllablesAttr == 2 & PersonNameAttr != "irrelevant" & PersonNameAttr != "title+family" & PersonNameAttr != "title+given")) # summarize data by intimacy of attribute and construction type data_summary <- kose_data %>% group_by(PersonNameAttr) %>% mutate(n_obs = n()) %>% ungroup() %>% group_by(PersonNameAttr, Type, n_obs) %>% summarise(freq = n()) %>% mutate(prop = freq/n_obs) %>% ungroup() %>% mutate(prop.dec = format(prop, decimal.mark = ","))#%>% # je nach Aufbau der Ausgangstabelle: hier Stop # create fig. 5 kose_data <- ggplot(data_summary, aes(x = PersonNameAttr, y = prop, fill = Type)) + geom_bar(stat = "identity") + labs(fill = "") + geom_text(aes(label=freq, y = prop), position = position_stack(vjust=0.5), color = "black",size=5) + ylab("") + xlab("") + coord_flip() + scale_y_continuous(breaks =seq(0,1,0.2), labels =scales::percent_format(accuracy = 1,decimal.mark = ',')) + scale_x_discrete(labels= c("Kosename","Rufname","Familienname", "Ruf- & Familienname")) + theme(aspect.ratio = 0.6, legend.text = ggtext::element_markdown()) # change colors and legend labels show(kose_data <- kose_data + scale_fill_manual(values = c("von"= "#1F968BFF", "post" = "#DCE319FF", "pre" = "#7AD151FF"), labels=c("*von*-Attribut","postnominaler Genitiv","pränominaler Genitiv"))) ################################################################## ## Kopf (2021), tab. 7 (attributes ending in -*s* in genvon-en) ## ################################################################## # create contingency tables for all attributes with stems ending in -*s* by construction type (absolute and relative numbers) show(s_abs <- table(genvon_en$Type, genvon_en$FinalAttr)) show(s_prop <- prop.table(table(genvon_en$Type, genvon_en$FinalAttr),2)) ############################################################### ## Kopf (2021), fig. 6 (name type of attribute in genvon-en) ## ############################################################### # reorder levels for figure genvon_en$Type <- factor(genvon_en$Type, levels=c("von", "post", "pre")) # create contingency table for name type by construction type (relative numbers) show(nametype_von <- prop.table(table(genvon_en$Type, genvon_en$SemNameAttr),2)) # summarize data by name type and construction type data_summary <- genvon_en %>% group_by(SemNameAttr) %>% mutate(n_obs = n()) %>% ungroup() %>% group_by(SemNameAttr, Type, n_obs) %>% summarise(freq = n()) %>% mutate(prop = freq/n_obs) %>% ungroup() %>% mutate(prop.dec = format(prop, decimal.mark = ","))#%>% # je nach Aufbau der Ausgangstabelle: hier Stop # create fig. 6 nametype2_data <- ggplot(data_summary, aes(x = SemNameAttr, y = prop, fill = Type)) + geom_bar(stat = "identity") + labs(fill = "") + geom_text(aes(label=freq, y = prop), position = position_stack(vjust=0.5), color = "black",size=6) + ylab("") + xlab("") + coord_flip() + scale_y_continuous(breaks =seq(0,1,0.2), labels =scales::percent_format(accuracy = 1,decimal.mark = ',')) + scale_x_discrete(labels= c("Personennamen","Namen mensch-\n licher Kollektiva","Toponyme")) + theme(aspect.ratio = 0.6, legend.text = ggtext::element_markdown()) # change colors and legend labels show(nametype2_data <- nametype2_data + scale_fill_manual(values = c("pre" = "#7AD151FF", "post" = "#DCE319FF", "von"= "#1F968BFF"), labels=c("*von*-Attribut","postnominaler Genitiv","pränominaler Genitiv")))