Modelado de tópicos usando BTB
Aplico un modelado de tópicos empleando el paquete
BTM.
Subo la data para este análisis
moderna <- textreadr::read_document("G:/episteme_moderna.txt")
moderna <- textreadr::read_document("G:/episteme_moderna.txt")
En ocasiones la codificación del texto cambia automáticamente a
ASCII y necesito que esté en UTF-8
. Arreglo encondign
moderna <- tau::fixEncoding(moderna)
moderna <- tau::fixEncoding(moderna)
Subo el paquete BTM
library(BTM)
spa_model <- udpipe_download_model(language = "spanish")
spa_model <- udpipe_load_model(spa_model$file_model)
Selecciono el modelo y preparo el marco de datos:
moderna_df <- tibble(id = paste0("tag-",1:length(moderna)),
texto = moderna)
moderna_tag <- udpipe_annotate(spa_model, x = moderna_df$texto,
doc_id = moderna_df$id)
moderna_tag <- as.data.frame(moderna_tag)
stats <- txt_freq(moderna_tag$upos)
stats$key <- factor(stats$key,
levels = rev(stats$key))
Visualizo las partes de la oración capturadas y segmentadas:
stats %>%
ggplot(aes(freq, key))+
geom_col() +
labs(title = "Episteme moderna", subtitle = "UPOS (Universal Parts of Speech)\n según frecuencia", x = "Frecuencia", y= "UPOs")
Selecciono los sustantivos y, posteriormente, los adjetivos:
nombres <- subset(moderna_tag, upos %in% c("NOUN"))
nombres <- txt_freq(nombres$token)
nombres$key <- factor(nombres$key, levels = rev(nombres$key))
nombres[1:10,] %>%
mutate(freq_pct = round(freq_pct,3)) %>%
ggplot(aes(freq_pct, key))+
geom_col() +
labs(title = "Episteme clásica", subtitle = "Nombres según frecuencia relativa", x = "Frecuencia (tanto por cien)", y= "Nombres")
adjetivos <- subset(moderna_tag, upos %in% c("ADJ"))
adejetivos <- txt_freq(adjetivos$token)
adejetivos$key <- factor(adejetivos$key,
levels = rev(adejetivos$key))
adejetivos[1:10,] %>%
mutate(freq_pct = round(freq_pct,4)) %>%
ggplot(aes(freq_pct, key))+
geom_col() +
labs(title = "Episteme moderna", subtitle = "Adjetivos según frecuencia relativa", x = "Frecuencia (tanto por cien)", y= "Adjetivos")
Búsqueda de palabras claves
Aplico el Rapid Automatic Keyword Extraction, RAKE, para extraer palabras claves
clasic_rake <- keywords_rake(x = moderna_tag, term = "lemma",
group = "doc_id",
relevant = moderna_tag$upos %in% c("NOUN",
"ADJ"))
clasic_rake$key <- factor(clasic_rake$keyword,
levels= rev(clasic_rake$keyword))
clasic_rake[c(1:15),] %>%
mutate(rake = round(rake,4)) %>%
ggplot(aes(rake, key))+
geom_col() +
labs(title = "Episteme clásica, 1968-1999", subtitle = "Palabras claves según RAKE", x = "RAKE", y= "Bigrama")
Secuencia de nombres-verbos, verbos-frases
moderna_tag$phrase_tag <- as_phrasemachine(moderna_tag$upos,
type = "upos")
clasic_stats <- keywords_phrases(x = moderna_tag$phrase_tag,
term = tolower(moderna_tag$token),
pattern = "(A|N)*N(P+D*(A|N)*N)*",
is_regex = TRUE, detailed = FALSE)
clasic_stats <- subset(clasic_stats, ngram > 1 & freq > 2)
clasic_stats$key <- factor(clasic_stats$keyword,
levels = rev(clasic_stats$keyword))
clasic_stats %>%
ggplot(aes(freq, key))+
geom_col() +
labs(title = "Episteme moderna",
subtitle = "Palabras claves según coocurrencias frecuentes", x = "Frecuencia", y= "Ngrama")
Coocurrencias
cooc <- cooccurrence(x = subset(moderna_tag, upos %in% c("NOUN",
"ADJ")),
term = "lemma",
group = c("doc_id", "paragraph_id",
"sentence_id"))
wordnetwork <- head(cooc, 30)
wordnetwork <- graph_from_data_frame(wordnetwork)
ggraph(wordnetwork, layout = "fr") +
geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "darkblue") +
geom_node_text(aes(label = name), col = "darkgreen",
size = 4) +
#theme_graph(base_family = "Arial Narrow") +
theme(legend.position = "none") +
labs(title = "Cooccurrencia palabras en la misma oración", subtitle = "Sustantivos y adjetivos")
cooc2 <- cooccurrence(moderna_tag$lemma,
relevant = moderna_tag$upos %in% c("NOUN",
"ADJ"),
skipgram = 1)
wordnetwork2 <- head(cooc2, 30)
wordnetwork2 <- graph_from_data_frame(wordnetwork2)
ggraph(wordnetwork2, layout = "fr") +
geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "violet") +
geom_node_text(aes(label = name), col = "darkgreen",
size = 4) +
#theme_graph(base_family = "Arial Narrow") +
theme(legend.position = "none") +
labs(title = "Cooccurrencia de palabras",
subtitle = "Sustantivos y adjetivos")
Correlaciones
moderna_tag[1:10, c("token_id", "token", "head_token_id", "dep_rel")]
x <- cbind_dependencies(moderna_tag, type = "parent")
dim(x)
[1] 4503 20
nominalsubject <- subset(x, dep_rel %in% c("nsubj"))
dim(nominalsubject)
[1] 163 20
nominalsubject <- nominalsubject[, c("dep_rel", "token",
"token_parent")]
nominalsubject
NA
LS0tDQp0aXRsZTogIkJUQiB0ZXNpcy4gSW50cm9kdWNjaW9uIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KYGBge3IsIGV2YWw9RkFMU0UgfQ0KbW9kZXJuYSA8LSB0ZXh0cmVhZHI6OnJlYWRfZG9jdW1lbnQoIkc6L2xhLWJpYmxpb3RlY2EvamVzdXMvYXB1bnRlcy9lcGlzdGVtZV9tb2Rlcm5hLnR4dCIpDQoNCm1vZGVybmEgPC0gdGV4dHJlYWRyOjpyZWFkX2RvY3VtZW50KCJHOi9sYS1iaWJsaW90ZWNhL2plc3VzL2FwdW50ZXMvZXBpc3RlbWVfbW9kZXJuYS50eHQiKQ0KYGBgDQoNCkFycmVnbG8gZW5jb25kaWduDQpgYGB7cn0NCm1vZGVybmEgPC0gdGF1OjpmaXhFbmNvZGluZyhtb2Rlcm5hKQ0KbW9kZXJuYSA8LSB0YXU6OmZpeEVuY29kaW5nKG1vZGVybmEpDQpgYGANCg0KDQpgYGB7cn0NCmxpYnJhcnkoQlRNKQ0KYGBgDQoNCg0KYGBge3J9DQpzcGFfbW9kZWwgPC0gdWRwaXBlX2Rvd25sb2FkX21vZGVsKGxhbmd1YWdlID0gInNwYW5pc2giKQ0Kc3BhX21vZGVsIDwtIHVkcGlwZV9sb2FkX21vZGVsKHNwYV9tb2RlbCRmaWxlX21vZGVsKQ0KYGBgDQoNCmBgYHtyfQ0KDQptb2Rlcm5hX2RmIDwtIHRpYmJsZShpZCA9IHBhc3RlMCgidGFnLSIsMTpsZW5ndGgobW9kZXJuYSkpLA0KICAgICAgICAgICAgICAgICAgICAgdGV4dG8gPSBtb2Rlcm5hKSANCg0KDQptb2Rlcm5hX3RhZyA8LSB1ZHBpcGVfYW5ub3RhdGUoc3BhX21vZGVsLCB4ID0gbW9kZXJuYV9kZiR0ZXh0bywNCiAgICAgICAgICAgICAgICAgICAgICAgICBkb2NfaWQgPSBtb2Rlcm5hX2RmJGlkKQ0KDQptb2Rlcm5hX3RhZyA8LSBhcy5kYXRhLmZyYW1lKG1vZGVybmFfdGFnKQ0KDQpgYGANCg0KDQpgYGB7ciBlc3RhZGlzdGljYXNCYXNpY2FzLCBldmFsPUZBTFNFfQ0KDQpzdGF0cyA8LSB0eHRfZnJlcShtb2Rlcm5hX3RhZyR1cG9zKQ0KDQpzdGF0cyRrZXkgPC0gZmFjdG9yKHN0YXRzJGtleSwNCiAgICAgICAgICAgICAgICAgICAgbGV2ZWxzID0gcmV2KHN0YXRzJGtleSkpDQpgYGANCg0KDQpgYGB7ciBlc3RzQmFzaWNhcywgY2FjaGU9VFJVRX0NCg0KDQpzdGF0cyAlPiUgDQogICAgICBnZ3Bsb3QoYWVzKGZyZXEsIGtleSkpKw0KICAgICAgZ2VvbV9jb2woKSArDQogICAgICBsYWJzKHRpdGxlID0gIkVwaXN0ZW1lIG1vZGVybmEiLCBzdWJ0aXRsZSA9ICJVUE9TIChVbml2ZXJzYWwgUGFydHMgb2YgU3BlZWNoKVxuIHNlZ8O6biBmcmVjdWVuY2lhIiwgeCA9ICJGcmVjdWVuY2lhIiwgeT0gIlVQT3MiKQ0KDQpgYGANCg0KDQoNCmBgYHtyIG9idGVuZ29Ob21icmVzfQ0KDQpub21icmVzIDwtIHN1YnNldChtb2Rlcm5hX3RhZywgdXBvcyAlaW4lIGMoIk5PVU4iKSkgDQpub21icmVzIDwtIHR4dF9mcmVxKG5vbWJyZXMkdG9rZW4pDQpub21icmVzJGtleSA8LSBmYWN0b3Iobm9tYnJlcyRrZXksIGxldmVscyA9IHJldihub21icmVzJGtleSkpDQoNCg0KYGBgDQoNCg0KYGBge3IgZXN0c0Jhc05vbWJyZXN9DQoNCg0Kbm9tYnJlc1sxOjEwLF0gJT4lIA0KICAgbXV0YXRlKGZyZXFfcGN0ID0gcm91bmQoZnJlcV9wY3QsMykpICU+JSANCiAgICAgIGdncGxvdChhZXMoZnJlcV9wY3QsIGtleSkpKw0KICAgICAgZ2VvbV9jb2woKSArDQogICAgICBsYWJzKHRpdGxlID0gIkVwaXN0ZW1lIG1vZGVybmEiLCBzdWJ0aXRsZSA9ICJOb21icmVzIHNlZ8O6biBmcmVjdWVuY2lhIHJlbGF0aXZhIiwgeCA9ICJGcmVjdWVuY2lhICh0YW50byBwb3IgY2llbikiLCB5PSAiTm9tYnJlcyIpDQoNCmBgYA0KDQoNCmBgYHtyIG9idGVuZ29BZGpldGl2b3N9DQoNCmFkamV0aXZvcyA8LSBzdWJzZXQobW9kZXJuYV90YWcsIHVwb3MgJWluJSBjKCJBREoiKSkgDQphZGVqZXRpdm9zIDwtIHR4dF9mcmVxKGFkamV0aXZvcyR0b2tlbikNCmFkZWpldGl2b3Mka2V5IDwtIGZhY3RvcihhZGVqZXRpdm9zJGtleSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgbGV2ZWxzID0gcmV2KGFkZWpldGl2b3Mka2V5KSkNCg0KDQpgYGANCg0KDQoNCmBgYHtyIGVzdHNCYXNBZGp9DQoNCg0KYWRlamV0aXZvc1sxOjEwLF0gJT4lIA0KICAgbXV0YXRlKGZyZXFfcGN0ID0gcm91bmQoZnJlcV9wY3QsNCkpICU+JSANCiAgICAgIGdncGxvdChhZXMoZnJlcV9wY3QsIGtleSkpKw0KICAgICAgZ2VvbV9jb2woKSArDQogICAgICBsYWJzKHRpdGxlID0gIkVwaXN0ZW1lIG1vZGVybmEiLCBzdWJ0aXRsZSA9ICJBZGpldGl2b3Mgc2Vnw7puIGZyZWN1ZW5jaWEgcmVsYXRpdmEiLCB4ID0gIkZyZWN1ZW5jaWEgKHRhbnRvIHBvciBjaWVuKSIsIHk9ICJBZGpldGl2b3MiKQ0KDQpgYGANCg0KDQoNCg0KIyMgQsO6c3F1ZWRhIGRlIHBhbGFicmFzIGNsYXZlcw0KDQoNClBhbGFicmFzIGNsYXZlcyBkZSBhY3VlcmRvIGNvbiBlbCBSYXBpZCBBdXRvbWF0aWMgS2V5d29yZCBFeHRyYWN0aW9uLCBSQUtFDQoNCg0KYGBge3IgY2xhc2ljUmFrZX0NCg0KY2xhc2ljX3Jha2UgPC0ga2V5d29yZHNfcmFrZSh4ID0gbW9kZXJuYV90YWcsIHRlcm0gPSAibGVtbWEiLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgZ3JvdXAgPSAiZG9jX2lkIiwgDQogICAgICAgICAgICAgICAgICAgICAgIHJlbGV2YW50ID0gbW9kZXJuYV90YWckdXBvcyAlaW4lIGMoIk5PVU4iLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIkFESiIpKQ0KDQpjbGFzaWNfcmFrZSRrZXkgPC0gZmFjdG9yKGNsYXNpY19yYWtlJGtleXdvcmQsIA0KICAgICAgICAgICAgICAgICAgICAgICBsZXZlbHM9IHJldihjbGFzaWNfcmFrZSRrZXl3b3JkKSkNCg0KDQpgYGANCg0KDQoNCmBgYHtyIGNsYXNpY3Jha2UsIGNhY2hlPVRSVUV9DQoNCg0KY2xhc2ljX3Jha2VbYygxOjE1KSxdICU+JSANCiAgIG11dGF0ZShyYWtlID0gcm91bmQocmFrZSw0KSkgJT4lIA0KICAgICAgZ2dwbG90KGFlcyhyYWtlLCBrZXkpKSsNCiAgICAgIGdlb21fY29sKCkgKw0KICAgICAgbGFicyh0aXRsZSA9ICJFcGlzdGVtZSBtb2Rlcm5hIiwgc3VidGl0bGUgPSAiUGFsYWJyYXMgY2xhdmVzIHNlZ8O6biBSQUtFIiwgeCA9ICJSQUtFIiwgeT0gIkJpZ3JhbWEiKQ0KDQpgYGANCg0KDQoNCg0KU2VjdWVuY2lhIGRlIG5vbWJyZXMtdmVyYm9zLCB2ZXJib3MtZnJhc2VzIA0KDQpgYGB7ciBjbGFzaWNDb29jfQ0KDQptb2Rlcm5hX3RhZyRwaHJhc2VfdGFnIDwtIGFzX3BocmFzZW1hY2hpbmUobW9kZXJuYV90YWckdXBvcywgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHlwZSA9ICJ1cG9zIikNCg0KY2xhc2ljX3N0YXRzIDwtIGtleXdvcmRzX3BocmFzZXMoeCA9IG1vZGVybmFfdGFnJHBocmFzZV90YWcsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdGVybSA9IHRvbG93ZXIobW9kZXJuYV90YWckdG9rZW4pLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgcGF0dGVybiA9ICIoQXxOKSpOKFArRCooQXxOKSpOKSoiLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgaXNfcmVnZXggPSBUUlVFLCBkZXRhaWxlZCA9IEZBTFNFKQ0KDQpjbGFzaWNfc3RhdHMgPC0gc3Vic2V0KGNsYXNpY19zdGF0cywgbmdyYW0gPiAxICYgZnJlcSA+IDIpDQoNCmNsYXNpY19zdGF0cyRrZXkgPC0gZmFjdG9yKGNsYXNpY19zdGF0cyRrZXl3b3JkLCANCiAgICAgICAgICAgICAgICAgICAgICAgIGxldmVscyA9IHJldihjbGFzaWNfc3RhdHMka2V5d29yZCkpDQoNCg0KYGBgDQoNCg0KDQpgYGB7ciBtb2Rlcm5hQ29vYywgY2FjaGU9VFJVRX0NCg0KDQpjbGFzaWNfc3RhdHMgJT4lIA0KICAgICAgZ2dwbG90KGFlcyhmcmVxLCBrZXkpKSsNCiAgICAgIGdlb21fY29sKCkgKw0KICAgICAgbGFicyh0aXRsZSA9ICJFcGlzdGVtZSBtb2Rlcm5hIiwgDQogICAgICAgICAgIHN1YnRpdGxlID0gIlBhbGFicmFzIGNsYXZlcyBzZWfDum4gY29vcnJ1Y2VuaWNhcyBmcmVjdWVudGVzIiwgeCA9ICJGcmVjdWVuY2lhIiwgeT0gIk5ncmFtYSIpDQoNCmBgYA0KDQoNCiMjIENvb2N1cnJlbmNpYXMNCg0KYGBge3IgbGFzQ29vY30NCg0KDQoNCmNvb2MgPC0gY29vY2N1cnJlbmNlKHggPSBzdWJzZXQobW9kZXJuYV90YWcsIHVwb3MgJWluJSBjKCJOT1VOIiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJBREoiKSksIA0KICAgICAgICAgICAgICAgICAgICAgdGVybSA9ICJsZW1tYSIsIA0KICAgICAgICAgICAgICAgICAgICAgZ3JvdXAgPSBjKCJkb2NfaWQiLCAicGFyYWdyYXBoX2lkIiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAic2VudGVuY2VfaWQiKSkNCg0KYGBgDQoNCg0KYGBge3IgbGFzQ29vY0csIGNhY2hlPVRSVUV9DQp3b3JkbmV0d29yayA8LSBoZWFkKGNvb2MsIDMwKQ0Kd29yZG5ldHdvcmsgPC0gZ3JhcGhfZnJvbV9kYXRhX2ZyYW1lKHdvcmRuZXR3b3JrKQ0KZ2dyYXBoKHdvcmRuZXR3b3JrLCBsYXlvdXQgPSAiZnIiKSArDQogIGdlb21fZWRnZV9saW5rKGFlcyh3aWR0aCA9IGNvb2MsIGVkZ2VfYWxwaGEgPSBjb29jKSwgZWRnZV9jb2xvdXIgPSAiZGFya2JsdWUiKSArDQogIGdlb21fbm9kZV90ZXh0KGFlcyhsYWJlbCA9IG5hbWUpLCBjb2wgPSAiZGFya2dyZWVuIiwgDQogICAgICAgICAgICAgICAgIHNpemUgPSA0KSArDQogICN0aGVtZV9ncmFwaChiYXNlX2ZhbWlseSA9ICJBcmlhbCBOYXJyb3ciKSArDQogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9ICJub25lIikgKw0KICBsYWJzKHRpdGxlID0gIkNvb2NjdXJyZW5jZGUgcGFsYWJyYXMgZW4gbGEgbWlzbWEgb3JhY2nDs24iLCBzdWJ0aXRsZSA9ICJTdXN0YW50aXZvcyB5IGFkamV0aXZvcyIpDQoNCmBgYA0KDQoNCg0KDQpgYGB7ciBsYXNDb29jaX0NCg0KDQpjb29jMiA8LSBjb29jY3VycmVuY2UobW9kZXJuYV90YWckbGVtbWEsIA0KICAgICAgICAgICAgICAgICAgcmVsZXZhbnQgPSBtb2Rlcm5hX3RhZyR1cG9zICVpbiUgYygiTk9VTiIsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJBREoiKSwgDQogICAgICAgICAgICAgICAgICBza2lwZ3JhbSA9IDEpDQoNCg0KYGBgDQoNCg0KYGBge3IgbGFzQ29vY0dpLCBjYWNoZT1UUlVFfQ0KDQp3b3JkbmV0d29yazIgPC0gaGVhZChjb29jMiwgMzApDQp3b3JkbmV0d29yazIgPC0gZ3JhcGhfZnJvbV9kYXRhX2ZyYW1lKHdvcmRuZXR3b3JrMikNCmdncmFwaCh3b3JkbmV0d29yazIsIGxheW91dCA9ICJmciIpICsNCiAgZ2VvbV9lZGdlX2xpbmsoYWVzKHdpZHRoID0gY29vYywgZWRnZV9hbHBoYSA9IGNvb2MpLCBlZGdlX2NvbG91ciA9ICJ2aW9sZXQiKSArDQogIGdlb21fbm9kZV90ZXh0KGFlcyhsYWJlbCA9IG5hbWUpLCBjb2wgPSAiZGFya2dyZWVuIiwgDQogICAgICAgICAgICAgICAgIHNpemUgPSA0KSArDQogICN0aGVtZV9ncmFwaChiYXNlX2ZhbWlseSA9ICJBcmlhbCBOYXJyb3ciKSArDQogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9ICJub25lIikgKw0KICBsYWJzKHRpdGxlID0gIkNvb2NjdXJyZW5jaWEgZGUgcGFsYWJyYXMiLCANCiAgICAgICBzdWJ0aXRsZSA9ICJTdXN0YW50aXZvcyB5IGFkamV0aXZvcyIpDQoNCmBgYA0KDQoNCiMjIENvcnJlbGFjaW9uZXMNCmBgYHtyLCBjYWNoZT1UUlVFfQ0KDQptb2Rlcm5hX3RhZ1sxOjEwLCBjKCJ0b2tlbl9pZCIsICJ0b2tlbiIsICJoZWFkX3Rva2VuX2lkIiwgImRlcF9yZWwiKV0NCg0KeCA8LSBjYmluZF9kZXBlbmRlbmNpZXMobW9kZXJuYV90YWcsIHR5cGUgPSAicGFyZW50IikNCg0KZGltKHgpDQoNCm5vbWluYWxzdWJqZWN0IDwtIHN1YnNldCh4LCBkZXBfcmVsICVpbiUgYygibnN1YmoiKSkNCmRpbShub21pbmFsc3ViamVjdCkNCg0Kbm9taW5hbHN1YmplY3QgPC0gbm9taW5hbHN1YmplY3RbLCBjKCJkZXBfcmVsIiwgInRva2VuIiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAidG9rZW5fcGFyZW50IildDQpub21pbmFsc3ViamVjdA0KDQpgYGA=
No hay comentarios.:
Publicar un comentario